euleros inclusion
category: feature
feature: Classify page hotness through page table scanning
bugzilla: NA
CVE: NA
-------------------------------------------------
This patch proposes the etmem feature for openEuler. etmem periodically scans the VMA segments of the target process by walking its page tables and checking the access bit of each page, then reports the scan results to user space, so that we can better classify page hotness and further migrate hot pages to the fast memory tier and cold pages to the slow memory tier.
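An illustrative user-space consumer (not part of this patch; a minimal
sketch assuming the PIP_* byte encoding defined in fs/proc/etmem_scan.h
below, with hypothetical names and only token error handling):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define PIP_TYPE(a)		(0xf & ((a) >> 4))
#define PIP_SIZE(a)		(0xf & (a))
#define PIP_CMD_SET_HVA		0xa0	/* PIP_COMPOSE(PIP_CMD, 0) */

int main(int argc, char **argv)
{
	uint8_t buf[4096];
	char path[64];
	ssize_t n, i;
	int fd;

	if (argc < 2)
		return 1;
	snprintf(path, sizeof(path), "/proc/%s/idle_pages", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;

	/* the file borrows mem_lseek(), so the offset is the start VA */
	lseek(fd, 0, SEEK_SET);
	n = read(fd, buf, sizeof(buf));

	for (i = 0; i < n;) {
		if (buf[i] == PIP_CMD_SET_HVA) {
			uint64_t va = 0;
			int j;

			if (i + 8 >= n)
				break;
			for (j = 1; j <= 8; j++)	/* big-endian u64 */
				va = (va << 8) | buf[i + j];
			printf("VA 0x%llx:\n", (unsigned long long)va);
			i += 9;
			continue;
		}
		/* high nibble: ProcIdlePageType; low nibble: run length */
		printf("  type %u x %u\n", PIP_TYPE(buf[i]), PIP_SIZE(buf[i]));
		i++;
	}
	close(fd);
	return 0;
}

Each read() reports and clears the access bits for the range it covers,
so repeated reads over the same range approximate per-interval hotness.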
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: yanxiaodan <yanxiaodan@huawei.com>
Signed-off-by: Feilong Lin <linfeilong@huawei.com>
Signed-off-by: geruijun <geruijun@huawei.com>
Signed-off-by: liubo <liubo254@huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Yuchen Tang <tangyuchen5@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig  |    2 +
 arch/x86/configs/openeuler_defconfig    |    2 +
 fs/proc/Makefile                        |    2 +
 fs/proc/base.c                          |    6 +
 fs/proc/etmem_proc.c                    |  122 ++
 fs/proc/etmem_scan.c                    | 1440 ++++++++++++++++++++++++
 fs/proc/etmem_scan.h                    |  149 +++
 fs/proc/internal.h                      |    3 +
 include/linux/etmem.h                   |   27 +
 include/linux/mm.h                      |    4 +
 include/linux/mm_types.h                |    6 +
 include/uapi/asm-generic/mman-common.h  |    4 +
 mm/Kconfig                              |   23 +
 mm/madvise.c                            |   16 +
 mm/pagewalk.c                           |    3 +
 virt/kvm/kvm_main.c                     |    6 +
 16 files changed, 1815 insertions(+)
 create mode 100644 fs/proc/etmem_proc.c
 create mode 100644 fs/proc/etmem_scan.c
 create mode 100644 fs/proc/etmem_scan.h
 create mode 100644 include/linux/etmem.h
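Usage note (illustrative sketch only, not part of the diff): how a workload
might restrict etmem scanning to chosen regions, assuming the MADV_SWAPFLAG
value from include/uapi/asm-generic/mman-common.h and the VMA_SCAN_* ioctls
from fs/proc/etmem_scan.h in this series; the wrapper names are hypothetical.

#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define MADV_SWAPFLAG		0x1100	/* MADV_ETMEM_BASE */
#define MADV_SWAPFLAG_REMOVE	(MADV_SWAPFLAG + 1)

#define VMA_SCAN_FLAG		0x1000
#define VMA_SCAN_ADD_FLAGS	_IOW(0x66, 0x2, unsigned int)

/* tag a region as a swap-out candidate for etmem */
static int tag_for_swap(void *addr, size_t len)
{
	return madvise(addr, len, MADV_SWAPFLAG);
}

/* ask the scanner (fd on /proc/<pid>/idle_pages) to skip untagged VMAs */
static int scan_only_tagged(int fd)
{
	unsigned int flags = VMA_SCAN_FLAG;

	return ioctl(fd, VMA_SCAN_ADD_FLAGS, &flags);
}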
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9d2f717c1f7c..01a5c61cebbb 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1071,6 +1071,8 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM=y CONFIG_CMA=y # CONFIG_CMA_DEBUG is not set # CONFIG_CMA_DEBUGFS is not set diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index b40100e16683..dafa73172937 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1093,6 +1093,8 @@ CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_ETMEM_SCAN=m +CONFIG_ETMEM=y # CONFIG_CMA is not set CONFIG_MEM_SOFT_DIRTY=y CONFIG_GENERIC_EARLY_IOREMAP=y diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8ba..b9a7bc7d8a75 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -34,3 +34,5 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o +obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o +proc-${CONFIG_ETMEM} += etmem_proc.o diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..e634c61ba1ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3285,6 +3285,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif +#ifdef CONFIG_ETMEM + REG("idle_pages", 0600, proc_mm_idle_operations), +#endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif @@ -3634,6 +3637,9 @@ static const struct pid_entry tid_base_stuff[] = { REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif +#ifdef CONFIG_ETMEM + REG("idle_pages", 0600, proc_mm_idle_operations), +#endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif diff --git a/fs/proc/etmem_proc.c b/fs/proc/etmem_proc.c new file mode 100644 index 000000000000..edacb9260345 --- /dev/null +++ b/fs/proc/etmem_proc.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/seq_file.h> +#include <linux/pagemap.h> +#include <linux/mempolicy.h> +#include <linux/swap.h> +#include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> +#include <linux/uaccess.h> +#include <linux/pkeys.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/etmem.h> + +#include "internal.h" + +static DEFINE_SPINLOCK(scan_lock); + +static int page_scan_lock(struct file *file, int is_lock, struct file_lock *flock) +{ + if (is_lock) + spin_lock(&scan_lock); + else + spin_unlock(&scan_lock); + + return 0; +} + +/* will be filled when kvm_ept_idle module loads */ +struct file_operations proc_page_scan_operations = { + .flock = page_scan_lock, +}; +EXPORT_SYMBOL_GPL(proc_page_scan_operations); + +static ssize_t mm_idle_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + return ret; + } + if (proc_page_scan_operations.read) + 
ret = proc_page_scan_operations.read(file, buf, count, ppos); + + mmput(mm); + return ret; +} + +static int mm_idle_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + struct module *module = NULL; + int ret = -1; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + page_scan_lock(NULL, 1, NULL); + module = proc_page_scan_operations.owner; + if (module != NULL && try_module_get(module)) + ret = 0; + page_scan_lock(NULL, 0, NULL); + if (ret != 0) { + /* no scan ko installed, avoid to return valid file */ + return -ENODEV; + } + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) { + module_put(module); + return PTR_ERR(mm); + } + + file->private_data = mm; + + if (proc_page_scan_operations.open) + ret = proc_page_scan_operations.open(inode, file); + + if (ret != 0) + module_put(module); + + return ret; +} + +static int mm_idle_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (mm) { + if (!mm_kvm(mm)) + flush_tlb_mm(mm); + mmdrop(mm); + } + + if (proc_page_scan_operations.release) + ret = proc_page_scan_operations.release(inode, file); + + if (proc_page_scan_operations.owner) + module_put(proc_page_scan_operations.owner); + + return ret; +} + +static long mm_idle_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (proc_page_scan_operations.unlocked_ioctl) + return proc_page_scan_operations.unlocked_ioctl(filp, cmd, arg); + + return 0; +} + +const struct file_operations proc_mm_idle_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = mm_idle_read, + .open = mm_idle_open, + .release = mm_idle_release, + .unlocked_ioctl = mm_idle_ioctl, +}; diff --git a/fs/proc/etmem_scan.c b/fs/proc/etmem_scan.c new file mode 100644 index 000000000000..1e9677df9401 --- /dev/null +++ b/fs/proc/etmem_scan.c @@ -0,0 +1,1440 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/pagemap.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include <linux/uaccess.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/bitmap.h> +#include <linux/sched/mm.h> +#include <linux/version.h> +#include <linux/module.h> +#include <linux/io.h> +#include <linux/pagewalk.h> +#include <linux/uaccess.h> +#include <linux/align.h> +#include <asm/cacheflush.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <linux/huge_mm.h> +#include <linux/etmem.h> +#ifdef CONFIG_ARM64 +#include <asm/pgtable-types.h> +#include <asm/memory.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_pgtable.h> +#include <asm/stage2_pgtable.h> +#endif +#include "etmem_scan.h" +#include <linux/hugetlb_inline.h> + +#ifdef CONFIG_X86_64 +/* + * Fallback to false for kernel doens't support KVM_INVALID_SPTE + * ept_idle can sitll work in this situation but the scan accuracy may drop, + * depends on the access frequences of the workload. 
+ */ +#ifdef KVM_INVALID_SPTE +#define KVM_CHECK_INVALID_SPTE(val) ((val) == KVM_INVALID_SPTE) +#else +#define KVM_CHECK_INVALID_SPTE(val) (0) +#endif + +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +#endif /*CONFIG_X86_64*/ + +#ifdef CONFIG_ARM64 +#define if_pmd_thp_or_huge(pmd) (if_pmd_huge(pmd) || pmd_trans_huge(pmd)) +#endif /* CONFIG_ARM64 */ + +#ifdef DEBUG + +#define debug_printk trace_printk + +#define set_restart_gpa(val, note) ({ \ + unsigned long old_val = pic->restart_gpa; \ + pic->restart_gpa = (val); \ + trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ + (val), (pic->restart_gpa - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#define set_next_hva(val, note) ({ \ + unsigned long old_val = pic->next_hva; \ + pic->next_hva = (val); \ + trace_printk(" next_hva=%lx %luK %s %s %d\n", \ + (val), (pic->next_hva - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#else + +#define debug_printk(...) + +#define set_restart_gpa(val, note) ({ \ + pic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note) ({ \ + pic->next_hva = (val); \ +}) + +#endif + +#define RET_RESCAN_FLAG 0x10000 + +/* error return IDLE_PAGE_TYPE_MAX or return valid page type */ +enum ProcIdlePageType (*vm_handle_pte_hole)(unsigned long addr, + unsigned long next, int depth, struct mm_walk *walk) = NULL; +EXPORT_SYMBOL_GPL(vm_handle_pte_hole); + +static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, + phys_addr_t size) +{ + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? boundary : end; +} + +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + + return __stage2_range_addr_end(addr, end, size); +} + + +static int set_walk_step(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned int n; + + ret = kstrtouint(val, 0, &n); + if (ret != 0 || n == 0) + return -EINVAL; + + return param_set_uint(val, kp); +} + +static struct kernel_param_ops walk_step_ops = { + .set = set_walk_step, + .get = param_get_uint, +}; + +static unsigned int __read_mostly walk_step = 512; // in PAGE_SIZE +module_param_cb(walk_step, &walk_step_ops, &walk_step, 0644); + +static unsigned int resched_step = 10; +module_param(resched_step, uint, 0644); + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY_M] = PAGE_SIZE, + [PMD_DIRTY_M] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_pic(struct page_idle_ctrl *pic) +{ + debug_printk("page_idle_ctrl: pie_read=%d pie_read_max=%d", + pic->pie_read, + pic->pie_read_max); + debug_printk(" buf_size=%d bytes_copied=%d next_hva=%pK", + pic->buf_size, + pic->bytes_copied, + pic->next_hva); + debug_printk(" restart_gpa=%pK pa_to_hva=%pK\n", + pic->restart_gpa, + pic->gpa_to_hva); +} + +#ifdef CONFIG_ARM64 +static int if_pmd_huge(pmd_t pmd) +{ + return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); +} + +static int if_pud_huge(pud_t pud) +{ 
+#ifndef __PAGETABLE_PMD_FOLDED + return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT); +#else + return 0; +#endif +} +#endif + +static void pic_report_addr(struct page_idle_ctrl *pic, unsigned long addr) +{ + unsigned long hva; + + pic->kpie[pic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &pic->kpie[pic->pie_read]); + pic->pie_read += sizeof(uint64_t); + dump_pic(pic); +} + +static int pic_add_page(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + unsigned long page_size = pagetype_size[page_type]; + + dump_pic(pic); + + /* align kernel/user vision of cursor position */ + next = round_up(next, page_size); + + if (!pic->pie_read || + addr + pic->gpa_to_hva != pic->next_hva) { + /* merge hole */ + if (page_type == PTE_HOLE || + page_type == PMD_HOLE) { + set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); + return 0; + } + + if (addr + pic->gpa_to_hva < pic->next_hva) { + debug_printk("page_idle: addr moves backwards\n"); + WARN_ONCE(1, "page_idle: addr moves backwards"); + } + + if (pic->pie_read + sizeof(uint64_t) + 2 >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + + pic_report_addr(pic, round_down(addr, page_size) + + pic->gpa_to_hva); + } else { + if (PIP_TYPE(pic->kpie[pic->pie_read - 1]) == page_type && + PIP_SIZE(pic->kpie[pic->pie_read - 1]) < 0xF) { + set_next_hva(next + pic->gpa_to_hva, "IN-PLACE INC"); + set_restart_gpa(next, "IN-PLACE INC"); + pic->kpie[pic->pie_read - 1]++; + WARN_ONCE(page_size < next-addr, "next-addr too large"); + return 0; + } + if (pic->pie_read >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + } + + set_next_hva(next + pic->gpa_to_hva, "NEW-ITEM"); + set_restart_gpa(next, "NEW-ITEM"); + pic->kpie[pic->pie_read] = PIP_COMPOSE(page_type, 1); + pic->pie_read++; + + return 0; +} + +static int init_page_idle_ctrl_buffer(struct page_idle_ctrl *pic) +{ + pic->pie_read = 0; + pic->pie_read_max = min(PAGE_IDLE_KBUF_SIZE, + pic->buf_size - pic->bytes_copied); + /* reserve space for PIP_CMD_SET_HVA in the end */ + pic->pie_read_max -= sizeof(uint64_t) + 1; + + /* + * Align with PAGE_IDLE_KBUF_FULL + * logic in pic_add_page(), to avoid pic->pie_read = 0 when + * PAGE_IDLE_KBUF_FULL happened. 
+ */ + if (pic->pie_read_max <= sizeof(uint64_t) + 2) + return PAGE_IDLE_KBUF_FULL; + + memset(pic->kpie, 0, sizeof(pic->kpie)); + return 0; +} + +static void setup_page_idle_ctrl(struct page_idle_ctrl *pic, void *buf, + int buf_size, unsigned int flags) +{ + pic->buf = buf; + pic->buf_size = buf_size; + pic->bytes_copied = 0; + pic->next_hva = 0; + pic->gpa_to_hva = 0; + pic->restart_gpa = 0; + pic->last_va = 0; + pic->flags = flags; +} + +static int page_idle_copy_user(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end) +{ + int bytes_read; + int ret; + + dump_pic(pic); + + bytes_read = pic->pie_read; + if (!bytes_read) + return 0; + + ret = copy_to_user(pic->buf, pic->kpie, bytes_read); + if (ret) + return -EFAULT; + + pic->buf += bytes_read; + pic->bytes_copied += bytes_read; + if (pic->bytes_copied >= pic->buf_size) + return PAGE_IDLE_BUF_FULL; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + cond_resched(); + return 0; +} + +#ifdef CONFIG_X86_64 +static int vm_walk_host_range(unsigned long long start, + unsigned long end, + struct mm_walk *walk) +{ + int ret; + struct page_idle_ctrl *pic = walk->private; + unsigned long tmp_gpa_to_hva = pic->gpa_to_hva; + + pic->gpa_to_hva = 0; + +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else + spin_unlock_irq(&pic->kvm->mmu_lock); +#endif + down_read(&walk->mm->mmap_lock); + local_irq_disable(); + ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, + walk->ops, walk->private); + local_irq_enable(); + up_read(&walk->mm->mmap_lock); + pic->gpa_to_hva = tmp_gpa_to_hva; + if (pic->flags & VM_SCAN_HOST) { + pic->restart_gpa -= tmp_gpa_to_hva; + pic->flags &= ~VM_SCAN_HOST; + } + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + + /* ept page table may change after spin_unlock, rescan vm from root ept */ + ret |= RET_RESCAN_FLAG; + + return ret; +} + +static int ept_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (KVM_CHECK_INVALID_SPTE(pte->pte)) { + page_type = PTE_IDLE; + } else if (!ept_pte_present(*pte)) { + err = vm_walk_host_range(addr, end, walk); + goto next; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + if (pic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) &pte->pte)) + page_type = PTE_DIRTY_M; + } + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); +next: + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static enum ProcIdlePageType ept_huge_accessed(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + int accessed = PMD_IDLE; + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!KVM_CHECK_INVALID_SPTE(pte->pte)) + continue; + if (!ept_pte_present(*pte)) + continue; + if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)&pte->pte)) + continue; + accessed = PMD_ACCESSED; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return accessed; +} + +static int ept_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int 
err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) + page_type = PMD_IDLE; + else if (!ept_pmd_present(*pmd)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } else if (!pmd_large(*pmd)) { + if (pic->flags & SCAN_AS_HUGE) + page_type = ept_huge_accessed(pmd, addr, next); + else + page_type = pte_page_type; + } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else { + page_type = PMD_ACCESSED; + if ((pic->flags & SCAN_DIRTY_PAGE) && + test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) pmd)) + page_type = PMD_DIRTY_M; + } + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = ept_pte_range(pic, pmd, addr, next, walk); + +next: + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + + +static int ept_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (!ept_pud_present(*pud)) { + err = vm_walk_host_range(addr, next, walk); + goto next; + } + + if (pud_large(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = ept_pmd_range(pic, pud, addr, next, walk); + +next: + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int ept_p4d_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + unsigned long next; + int err = 0; + + p4d += p4d_index(addr); + do { + next = p4d_addr_end(addr, end); + if (!ept_p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = ept_pud_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int ept_pgd_range(struct page_idle_ctrl *pic, + pgd_t *pgd, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + pgd = pgd_offset_pgd(pgd, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + err = ept_p4d_range(pic, p4d, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + +static int ept_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + uint64_t *ept_root; + int err = 0; + + WARN_ON(addr >= end); + +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock_irq(&pic->kvm->mmu_lock); +#else + spin_lock_irq(&pic->kvm->mmu_lock); +#endif + + vcpu = kvm_get_vcpu(pic->kvm, 0); + if (!vcpu) { + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-VCPU"); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else + spin_unlock_irq(&pic->kvm->mmu_lock); +#endif + return -EINVAL; + } + + mmu = kvm_arch_mmu_pointer(vcpu); + if (!VALID_PAGE(mmu->root_hpa)) { + pic->gpa_to_hva = 0; + set_restart_gpa(TASK_SIZE, "NO-HPA"); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else + spin_unlock_irq(&pic->kvm->mmu_lock); +#endif + return -EINVAL; + } + + ept_root = 
__va(mmu->root_hpa); + + /* Walk start at p4d when vm has 4 level table pages */ + if (mmu->shadow_root_level != 4) + err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); + else + err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); + + /* mmu_lock is unlock in vm_walk_host_range which will unlock mmu_lock + * and RET_RESCAN_FLAG will be set in ret value + */ + if (!(err & RET_RESCAN_FLAG)) +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock_irq(&pic->kvm->mmu_lock); +#else + spin_unlock_irq(&pic->kvm->mmu_lock); +#endif + else + err &= ~RET_RESCAN_FLAG; + + return err; +} + +static int ept_idle_supports_cpu(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + int ret; + + vcpu = kvm_get_vcpu(kvm, 0); + if (!vcpu) + return -EINVAL; + +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&kvm->mmu_lock); +#else + spin_lock(&kvm->mmu_lock); +#endif + mmu = kvm_arch_mmu_pointer(vcpu); + if (kvm_mmu_ad_disabled(mmu)) { + pr_notice("CPU does not support EPT A/D bits tracking\n"); + ret = -EINVAL; + } else if (mmu->shadow_root_level < 4 || + (mmu->shadow_root_level == 5 && !pgtable_l5_enabled())) { + pr_notice("Unsupported EPT level %d\n", mmu->root_role.level); + ret = -EINVAL; + } else + ret = 0; +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&kvm->mmu_lock); +#else + spin_unlock(&kvm->mmu_lock); +#endif + + return ret; +} + +#else +static int arm_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else + page_type = PTE_ACCESSED; + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static int arm_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!if_pmd_thp_or_huge(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = arm_pte_range(pic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int arm_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (if_pud_huge(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = arm_pmd_range(pic, pud, addr, next); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int arm_p4d_range(struct page_idle_ctrl *pic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = 
p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = arm_pud_range(pic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int arm_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + struct kvm *kvm = pic->kvm; + int err = 0; + + WARN_ON(addr >= end); + +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&pic->kvm->mmu_lock); +#else + spin_lock(&pic->kvm->mmu_lock); +#endif + pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&pic->kvm->mmu_lock); +#else + spin_unlock(&pic->kvm->mmu_lock); +#endif + + local_irq_disable(); + do { + next = stage2_range_addr_end(addr, end); + if (!pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = arm_p4d_range(pic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + local_irq_enable(); + return err; +} +#endif + +/* + * Depending on whether hva falls in a memslot: + * + * 1) found => return gpa and remaining memslot size in *addr_range + * + * |<----- addr_range --------->| + * [ mem slot ] + * ^hva + * + * 2) not found => return hole size in *addr_range + * + * |<----- addr_range --------->| + * [first mem slot above hva ] + * ^hva + * + * If hva is above all mem slots, *addr_range will be ~0UL. + * We can finish read(2). + */ +static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, + unsigned long hva, + unsigned long *addr_range) +{ + struct kvm *kvm = pic->kvm; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + unsigned long hva_end; + int bkt; + gfn_t gfn; + + *addr_range = ~0UL; + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(pic->kvm); + kvm_for_each_memslot(memslot, bkt, slots) { + hva_end = memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT); + + if (hva >= memslot->userspace_addr && hva < hva_end) { + gpa_t gpa; + + gfn = hva_to_gfn_memslot(hva, memslot); + *addr_range = hva_end - hva; + gpa = gfn_to_gpa(gfn); + mutex_unlock(&kvm->slots_lock); + return gpa; + } + + if (memslot->userspace_addr > hva) + *addr_range = min(*addr_range, + memslot->userspace_addr - hva); + } + mutex_unlock(&kvm->slots_lock); + return INVALID_PAGE; +} + +static inline unsigned long mask_to_size(unsigned long mask) +{ + return ~mask + 1; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); +static int vm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + + pic->flags |= VM_SCAN_HOST; + + /* hugetlb page table entry of vm maybe not present while page is resident + * in address_space + */ + if (mask_to_size(hmask) != PUD_SIZE && !pte_present(*pte) && + vm_handle_pte_hole != NULL) { + page_type = vm_handle_pte_hole(addr, next, -1, walk); + if (page_type < IDLE_PAGE_TYPE_MAX) + return pic_add_page(pic, addr, next, page_type); + } + + return mm_idle_hugetlb_entry(pte, hmask, addr, next, walk); +} + +static int vm_idle_pte_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType pagetype; + + if (vm_handle_pte_hole == NULL) + return 0; + + pagetype = vm_handle_pte_hole(addr, 
next, depth, walk); + if (pagetype >= IDLE_PAGE_TYPE_MAX) + return 0; + + debug_printk("scan pte hole addr %pK type %d\n", addr, pagetype); + pic->flags |= VM_SCAN_HOST; + return pic_add_page(pic, addr, next, pagetype); +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pmd_entry(pmd, addr, next, walk); +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); +static int vm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + pic->flags |= VM_SCAN_HOST; + return mm_idle_pud_entry(pud, addr, next, walk); +} + +static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + unsigned long gpa_addr; + unsigned long gpa_next; + unsigned long gpa_end; + unsigned long addr_range; + unsigned long va_end; + int ret; + int steps; + +#ifdef CONFIG_X86_64 + ret = ept_idle_supports_cpu(pic->kvm); + if (ret) + return ret; +#endif + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + gpa_addr = vm_idle_find_gpa(pic, start, &addr_range); + + if (gpa_addr == INVALID_PAGE) { + pic->gpa_to_hva = 0; + if (addr_range == ~0UL) { + set_restart_gpa(TASK_SIZE, "EOF"); + va_end = end; + } else { + start += addr_range; + set_restart_gpa(start, "OUT-OF-SLOT"); + va_end = start; + } + } else { + pic->gpa_to_hva = start - gpa_addr; + gpa_end = gpa_addr + addr_range; + steps = 0; + for (; gpa_addr < gpa_end;) { + gpa_next = min(gpa_end, gpa_addr + walk_step * PAGE_SIZE); +#ifdef CONFIG_ARM64 + ret = arm_page_range(pic, gpa_addr, gpa_next); +#else + ret = ept_page_range(pic, gpa_addr, gpa_next, walk); +#endif + gpa_addr = pic->restart_gpa; + + if (ret) + break; + + if (++steps >= resched_step) { + cond_resched(); + steps = 0; + } + } + va_end = pic->gpa_to_hva + gpa_end; + } + + start = pic->restart_gpa + pic->gpa_to_hva; + ret = page_idle_copy_user(pic, start, va_end); + if (ret) + break; + } + + if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + + if (pic->bytes_copied) + ret = 0; + return ret; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk); +static ssize_t vm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk mm_walk = {}; + struct mm_walk_ops mm_walk_ops = {}; + struct page_idle_ctrl *pic; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + int ret; + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + pic->kvm = mm_kvm(mm); + + mm_walk_ops.pmd_entry = vm_idle_pmd_entry; + mm_walk_ops.pud_entry = vm_idle_pud_entry; + mm_walk_ops.hugetlb_entry = vm_idle_hugetlb_entry; + mm_walk_ops.pte_hole = vm_idle_pte_hole; + mm_walk_ops.test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = &mm_walk_ops; + mm_walk.private = pic; + + ret = vm_idle_walk_hva_range(pic, hva_start, hva_end, &mm_walk); + if (ret) + goto out_kvm; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_kvm: + kfree(pic); + return ret; + +} + 
+static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + +static ssize_t page_scan_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + + if ((hva_start >= TASK_SIZE) || (hva_end >= TASK_SIZE)) { + debug_printk("page_idle_read past TASK_SIZE: %pK %pK %lx\n", + hva_start, hva_end, TASK_SIZE); + return 0; + } + if (hva_end <= hva_start) { + debug_printk("page_idle_read past EOF: %pK %pK\n", + hva_start, hva_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("page_idle_read unaligned ppos: %pK\n", + hva_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("page_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + if (!mm_kvm(mm)) + return mm_idle_read(file, buf, count, ppos); + + return vm_idle_read(file, buf, count, ppos); +} + +static int page_scan_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int page_scan_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + struct kvm *kvm; + int ret = 0; + + if (!mm) { + ret = -EBADF; + goto out; + } + + kvm = mm_kvm(mm); + if (!kvm) { + ret = -EINVAL; + goto out; + } +#ifdef CONFIG_X86_64 +#ifdef KVM_HAVE_MMU_RWLOCK + write_lock(&kvm->mmu_lock); +#else + spin_lock(&kvm->mmu_lock); +#endif + + kvm_flush_remote_tlbs(kvm); +#ifdef KVM_HAVE_MMU_RWLOCK + write_unlock(&kvm->mmu_lock); +#else + spin_unlock(&kvm->mmu_lock); +#endif +#endif + +out: + module_put(THIS_MODULE); + return ret; +} + +static int mm_idle_pmd_large(pmd_t pmd) +{ +#ifdef CONFIG_ARM64 + return if_pmd_thp_or_huge(pmd); +#else + return pmd_large(pmd); +#endif +} + +static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, + unsigned long addr, unsigned long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PTE_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + unsigned long start = addr & hmask; /* hugepage may be splited in vm */ + int ret; + + if (mask_to_size(hmask) == PUD_SIZE) { + page_type = PUD_PRESENT; + goto add_page; + } + + if (!pte_present(*pte)) + page_type = PMD_HOLE; + else if (pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, (unsigned long *)pte)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + +add_page: + ret = pic_add_page(pic, start, start + pagetype_size[page_type], page_type); + return ret; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* + * Skip 
duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, + * walk_page_range() can call on the same PMD twice. + */ + if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK) && (pic->flags & SCAN_HUGE_PAGE)) { + debug_printk("ignore duplicate addr %pK %pK\n", + addr, pic->last_va); + set_restart_gpa(round_up(next, PMD_SIZE), "DUP_ADDR"); + return 0; + } + pic->last_va = addr; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!mm_idle_pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd) || + pic->flags & SCAN_IGN_HOST) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = mm_idle_pte_range(pic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + if ((addr & PUD_MASK) != (pic->last_va & PUD_MASK)) { + pic_add_page(pic, addr, next, PUD_PRESENT); + pic->last_va = addr; + } + spin_unlock(ptl); + return 1; + } + + return 0; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct page_idle_ctrl *pic = walk->private; + + /* If the specified page swapout is set, the untagged vma is skipped. */ + if ((pic->flags & VMA_SCAN_FLAG) && !(vma->vm_flags & VM_SWAPFLAG)) + return 1; + + if (vma->vm_file) { + if (is_vm_hugetlb_page(vma)) + return 0; + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct page_idle_ctrl *pic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret = 0; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + down_read(&walk->mm->mmap_lock); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = walk_page_range(walk->mm, start, end, + walk->ops, walk->private); + local_irq_enable(); + } else + set_restart_gpa(vma->vm_start, "VMA-HOLE"); + } else + set_restart_gpa(TASK_SIZE, "EOF"); + up_read(&walk->mm->mmap_lock); + + WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); + if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) + pic->restart_gpa = end; + start = pic->restart_gpa; + ret = page_idle_copy_user(pic, start, end); + if (ret) + break; + } + + if (start > pic->next_hva) + set_next_hva(start, "NEXT-START"); + + if (pic->bytes_copied) { + if (ret != PAGE_IDLE_BUF_FULL && pic->next_hva < end) + debug_printk("partial scan: next_hva=%pK end=%pK\n", + pic->next_hva, end); + ret = 0; + } else + debug_printk("nothing read"); + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk_ops *mm_walk_ops = NULL; + struct mm_walk mm_walk = {}; + struct page_idle_ctrl *pic; + unsigned long va_start = *ppos; + unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (va_end <= va_start) { + debug_printk("%s past EOF: %pK %pK\n", + __func__, va_start, va_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) 
{ + debug_printk("%s unaligned ppos: %pK\n", + __func__, va_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("%s small count: %lx\n", + __func__, (unsigned long)count); + return -EINVAL; + } + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + mm_walk_ops = kzalloc(sizeof(struct mm_walk_ops), GFP_KERNEL); + if (!mm_walk_ops) { + kfree(pic); + return -ENOMEM; + } + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + + mm_walk_ops->pmd_entry = mm_idle_pmd_entry; + mm_walk_ops->pud_entry = mm_idle_pud_entry; + mm_walk_ops->hugetlb_entry = mm_idle_hugetlb_entry; + mm_walk_ops->test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = mm_walk_ops; + mm_walk.private = pic; + mm_walk.pgd = NULL; + mm_walk.no_vma = false; + ret = mm_idle_walk_range(pic, va_start, va_end, &mm_walk); + if (ret) + goto out_free; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_free: + kfree(pic); + kfree(mm_walk_ops); + return ret; +} + +static long page_scan_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int flags; + + if (get_user(flags, (unsigned int __user *)argp)) + return -EFAULT; + flags &= ALL_SCAN_FLAGS; + + switch (cmd) { + case IDLE_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case IDLE_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + case VMA_SCAN_ADD_FLAGS: + filp->f_flags |= flags; + break; + case VMA_SCAN_REMOVE_FLAGS: + filp->f_flags &= ~flags; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +extern struct file_operations proc_page_scan_operations; + +static int page_scan_entry(void) +{ + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = THIS_MODULE; + proc_page_scan_operations.read = page_scan_read; + proc_page_scan_operations.open = page_scan_open; + proc_page_scan_operations.release = page_scan_release; + proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl; + proc_page_scan_operations.flock(NULL, 0, NULL); + + return 0; +} + +static void page_scan_exit(void) +{ + proc_page_scan_operations.flock(NULL, 1, NULL); + proc_page_scan_operations.owner = NULL; + proc_page_scan_operations.read = NULL; + proc_page_scan_operations.open = NULL; + proc_page_scan_operations.release = NULL; + proc_page_scan_operations.unlocked_ioctl = NULL; + proc_page_scan_operations.flock(NULL, 0, NULL); +} + +MODULE_LICENSE("GPL"); +module_init(page_scan_entry); +module_exit(page_scan_exit); diff --git a/fs/proc/etmem_scan.h b/fs/proc/etmem_scan.h new file mode 100644 index 000000000000..e109f7f350e1 --- /dev/null +++ b/fs/proc/etmem_scan.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PAGE_IDLE_H +#define _PAGE_IDLE_H + +#include <linux/types.h> + +#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ +#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ +#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ + +/* define to not used file flags */ +#define SCAN_AS_HUGE 0100000000 /* treat normal page as hugepage in vm */ +#define SCAN_IGN_HOST 0200000000 /* ignore host access when scan vm */ +#define VM_SCAN_HOST 0400000000 /* scan and add host page for vm hole(internal) */ +#define VMA_SCAN_FLAG 0x1000 /* scan the specifics vma with flag */ + +#define ALL_SCAN_FLAGS (SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | \ + SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST | VMA_SCAN_FLAG) + +#define IDLE_SCAN_MAGIC 0x66 +#define IDLE_SCAN_ADD_FLAGS 
_IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int) +#define IDLE_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int) +#define VMA_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x2, unsigned int) +#define VMA_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x3, unsigned int) + +enum ProcIdlePageType { + PTE_ACCESSED, /* 4k page */ + PMD_ACCESSED, /* 2M page */ + PUD_PRESENT, /* 1G page */ + + PTE_DIRTY_M, + PMD_DIRTY_M, + + PTE_IDLE, + PMD_IDLE, + PMD_IDLE_PTES, /* all PTE idle */ + + PTE_HOLE, + PMD_HOLE, + + PIP_CMD, + + IDLE_PAGE_TYPE_MAX +}; + +#define PIP_TYPE(a) (0xf & (a >> 4)) +#define PIP_SIZE(a) (0xf & a) +#define PIP_COMPOSE(type, nr) ((type << 4) | nr) + +#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0) + +#ifndef INVALID_PAGE +#define INVALID_PAGE ~0UL +#endif + +#ifdef CONFIG_ARM64 +#define _PAGE_MM_BIT_ACCESSED 10 +#else +#define _PAGE_MM_BIT_ACCESSED _PAGE_BIT_ACCESSED +#endif + +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_EPT_ACCESSED 8 +#define _PAGE_BIT_EPT_DIRTY 9 +#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED) +#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY) + +#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7)) + +static inline int ept_pte_present(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pmd_present(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pud_present(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_p4d_present(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pgd_present(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pte_accessed(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pmd_accessed(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pud_accessed(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_p4d_accessed(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pgd_accessed(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_ACCESSED; +} +#endif + +extern struct file_operations proc_page_scan_operations; + +#define PAGE_IDLE_KBUF_FULL 1 +#define PAGE_IDLE_BUF_FULL 2 +#define PAGE_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3) + +#define PAGE_IDLE_KBUF_SIZE 8000 + +struct page_idle_ctrl { + struct mm_struct *mm; + struct kvm *kvm; + + uint8_t kpie[PAGE_IDLE_KBUF_SIZE]; + int pie_read; + int pie_read_max; + + void __user *buf; + int buf_size; + int bytes_copied; + + unsigned long next_hva; /* GPA for EPT; VA for PT */ + unsigned long gpa_to_hva; + unsigned long restart_gpa; + unsigned long last_va; + + unsigned int flags; +}; + +#endif diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff5..a9615455b709 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -303,6 +303,9 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +#ifdef CONFIG_ETMEM +extern const struct file_operations proc_mm_idle_operations; +#endif
extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/include/linux/etmem.h b/include/linux/etmem.h new file mode 100644 index 000000000000..e8a2585f3891 --- /dev/null +++ b/include/linux/etmem.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_ETMEM_H_ +#define __MM_ETMEM_H_ + +#include <linux/list.h> +#include <asm/page.h> +#include <linux/mmzone.h> +#include <linux/memcontrol.h> +#include <linux/page-flags.h> + +#ifdef CONFIG_ETMEM + +#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return mm->kvm; +} +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return NULL; +} +#endif + + +#endif /* #ifdef CONFIG_ETMEM */ +#endif /* define __MM_ETMEM_H_ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8cf86b56aba5..37b12832e7de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -308,6 +308,10 @@ extern unsigned int kobjsize(const void *objp); # define VM_SOFTDIRTY 0 #endif
+#ifdef CONFIG_ETMEM +#define VM_SWAPFLAG 0x400000000000000 /* memory swap out flag in vma */ +#endif + #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 582aa5e44a5a..9226a236b2cb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -931,6 +931,12 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN */ +#ifdef CONFIG_ETMEM +#if IS_ENABLED(CONFIG_KVM) && !defined(__GENKSYMS__) + struct kvm *kvm; +#endif +#endif + } __randomize_layout;
/* diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432..14e5498efd7a 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,10 @@
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+#define MADV_ETMEM_BASE 0x1100 +#define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ +#define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) + /* compatibility flags */ #define MAP_FILE 0
diff --git a/mm/Kconfig b/mm/Kconfig index ece4f2847e2b..919f451d683d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -903,6 +903,29 @@ config USE_PERCPU_NUMA_NODE_ID config HAVE_SETUP_PER_CPU_AREA bool
+config ETMEM_SCAN + tristate "module: etmem page scan for etmem support" + depends on ETMEM + help + etmem scan is a critical part of the etmem feature. + A kernel module, etmem_scan.ko periodically scans the appointed vma + segments of the target process, perform page table walk accordingly, + check and clear the access bit of each page before finally report + scan results back to user space. + etmem scan also supports virtual machines. + +config ETMEM + bool "Enable etmem feature" + depends on MMU + depends on X86 || ARM64 + default n + help + etmem is a tiered memory extension technology that uses DRAM and memory + compression/high-performance storage media to form tiered memory storage. + Memory data is tiered, and cold data is migrated from memory media to + high-performance storage media to release memory space and reduce + memory costs. + config CMA bool "Contiguous Memory Allocator" depends on MMU diff --git a/mm/madvise.c b/mm/madvise.c index 4dded5d27e7e..726668a521dc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1092,6 +1092,14 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + new_flags |= VM_SWAPFLAG; + break; + case MADV_SWAPFLAG_REMOVE: + new_flags &= ~VM_SWAPFLAG; + break; +#endif case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); } @@ -1196,6 +1204,10 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: +#endif +#ifdef CONFIG_ETMEM + case MADV_SWAPFLAG: + case MADV_SWAPFLAG_REMOVE: #endif return true;
@@ -1386,6 +1398,10 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. + * MADV_SWAPFLAG - Used in the etmem memory extension feature, the process + * specifies the memory swap area by adding a flag to a specific + * vma address. + * MADV_SWAPFLAG_REMOVE - remove the specific vma flag * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b7d7e4fcfad7..2ac04c1dbb09 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -525,6 +525,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } while (start = next, start < end); return err; } +#ifdef CONFIG_ETMEM +EXPORT_SYMBOL_GPL(walk_page_range); +#endif
/** * walk_page_range_novma - walk a range of pagetables not backed by a vma diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 486800a7024b..30fb090a61d7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1305,6 +1305,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); +#if IS_ENABLED(CONFIG_KVM) + mm->kvm = NULL; +#endif kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); @@ -5098,6 +5101,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) goto put_kvm; }
+#if IS_ENABLED(CONFIG_KVM) + kvm->mm->kvm = kvm; +#endif /* * Don't call kvm_put_kvm anymore at this point; file->f_op is * already set, with ->release() being kvm_vm_release(). In error