euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7RO5Q Reference: https://gitee.com/openeuler/kernel/commit/c13e5b6a937c6865c461484b07530be3c3...
-------------------------------------------------
reason:This patch is used to add memig scan feature to openEuler system. memig_scan.ko is used to scan the virtual address of the target process and return the address access information to the user mode for grading cold and hot pages.
Signed-off-by: Fengguang Wu fengguang.wu@intel.com Signed-off-by: yanxiaodan yanxiaodan@huawei.com Signed-off-by: Feilong Lin linfeilong@huawei.com Signed-off-by: geruijun geruijun@huawei.com Signed-off-by: liubo liubo254@huawei.com Reviewed-by: Jing Xiangfeng jingxiangfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + fs/proc/Makefile | 1 + fs/proc/base.c | 2 + fs/proc/internal.h | 1 + fs/proc/memig_scan.c | 1079 ++++++++++++++++++++++++ fs/proc/memig_scan.h | 132 +++ fs/proc/task_mmu.c | 66 ++ include/linux/mm_types.h | 18 +- lib/Kconfig | 6 + mm/huge_memory.c | 1 + mm/pagewalk.c | 1 + virt/kvm/kvm_main.c | 6 + 13 files changed, 1314 insertions(+), 1 deletion(-) create mode 100644 fs/proc/memig_scan.c create mode 100644 fs/proc/memig_scan.h
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 63abdb3f8c63..98bfce36da9f 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7805,6 +7805,7 @@ CONFIG_FUNCTION_ERROR_INJECTION=y CONFIG_ARCH_HAS_KCOV=y # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y +CONFIG_MEMIG_SCAN_MODULE=m # CONFIG_MEMTEST is not set # end of Kernel Testing and Coverage
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 0e05e7a15fdb..febe4b5d9c0c 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -9043,6 +9043,7 @@ CONFIG_ARCH_HAS_KCOV=y # CONFIG_KCOV is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_ARCH_USE_MEMTEST=y +CONFIG_MEMIG_SCAN_MODULE=m # CONFIG_MEMTEST is not set # CONFIG_HYPERV_TESTING is not set # end of Kernel Testing and Coverage diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8ba..50c6de6f4979 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -34,3 +34,4 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o +obj-$(CONFIG_MEMIG_SCAN_MODULE) += memig_scan.o diff --git a/fs/proc/base.c b/fs/proc/base.c index 05452c3b9872..9c19fca01f69 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3283,6 +3283,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), + REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -3631,6 +3632,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), + REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9dda7e54b2d0..09867fbcbafc 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -305,6 +305,7 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_mm_idle_operations;
extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/memig_scan.c b/fs/proc/memig_scan.c new file mode 100644 index 000000000000..3964e7652127 --- /dev/null +++ b/fs/proc/memig_scan.c @@ -0,0 +1,1079 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/pagemap.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include <linux/uaccess.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/bitmap.h> +#include <linux/sched/mm.h> +#include <linux/version.h> +#include <linux/module.h> +#include <linux/io.h> +#include <linux/pagewalk.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <linux/huge_mm.h> +#ifdef CONFIG_ARM64 +#include <asm/pgtable-types.h> +#include <asm/memory.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_arm.h> +#include <asm/stage2_pgtable.h> +#endif +#include "memig_scan.h" + +#ifdef CONFIG_X86_64 +/* + * Fallback to false for kernel doens't support KVM_INVALID_SPTE + * ept_idle can sitll work in this situation but the scan accuracy may drop, + * depends on the access frequences of the workload. + */ +#ifdef KVM_INVALID_SPTE +#define KVM_CHECK_INVALID_SPTE(val) ((val) == KVM_INVALID_SPTE) +#else +#define KVM_CHECK_INVALID_SPTE(val) (0) +#endif + +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +#endif /*CONFIG_X86_64*/ + +#ifdef CONFIG_ARM64 +#define if_pmd_thp_or_huge(pmd) (if_pmd_huge(pmd) || pmd_trans_huge(pmd)) +#endif /* CONFIG_ARM64 */ + +#ifdef DEBUG + +#define debug_printk trace_printk + +#define set_restart_gpa(val, note) ({ \ + unsigned long old_val = pic->restart_gpa; \ + pic->restart_gpa = (val); \ + trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ + (val), (pic->restart_gpa - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#define set_next_hva(val, note) ({ \ + unsigned long old_val = pic->next_hva; \ + pic->next_hva = (val); \ + trace_printk(" next_hva=%lx %luK %s %s %d\n", \ + (val), (pic->next_hva - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#else + +#define debug_printk(...) + +#define set_restart_gpa(val, note) ({ \ + pic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note) ({ \ + pic->next_hva = (val); \ +}) + +#endif + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY_M] = PAGE_SIZE, + [PMD_DIRTY_M] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_pic(struct page_idle_ctrl *pic) +{ + debug_printk("page_idle_ctrl: pie_read=%d pie_read_max=%d", + pic->pie_read, + pic->pie_read_max); + debug_printk(" buf_size=%d bytes_copied=%d next_hva=%pK", + pic->buf_size, + pic->bytes_copied, + pic->next_hva); + debug_printk(" restart_gpa=%pK pa_to_hva=%pK\n", + pic->restart_gpa, + pic->gpa_to_hva); +} + +#ifdef CONFIG_ARM64 +static int if_pmd_huge(pmd_t pmd) +{ + return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); +} + +static int if_pud_huge(pud_t pud) +{ +#ifndef __PAGETABLE_PMD_FOLDED + return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT); +#else + return 0; +#endif +} +#endif + +static void pic_report_addr(struct page_idle_ctrl *pic, unsigned long addr) +{ + unsigned long hva; + + pic->kpie[pic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &pic->kpie[pic->pie_read]); + pic->pie_read += sizeof(uint64_t); + dump_pic(pic); +} + +static int pic_add_page(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + unsigned long page_size = pagetype_size[page_type]; + + dump_pic(pic); + + /* align kernel/user vision of cursor position */ + next = round_up(next, page_size); + + if (!pic->pie_read || + addr + pic->gpa_to_hva != pic->next_hva) { + /* merge hole */ + if (page_type == PTE_HOLE || + page_type == PMD_HOLE) { + set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); + return 0; + } + + if (addr + pic->gpa_to_hva < pic->next_hva) { + debug_printk("page_idle: addr moves backwards\n"); + WARN_ONCE(1, "page_idle: addr moves backwards"); + } + + if (pic->pie_read + sizeof(uint64_t) + 2 >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + + pic_report_addr(pic, round_down(addr, page_size) + + pic->gpa_to_hva); + } else { + if (PIP_TYPE(pic->kpie[pic->pie_read - 1]) == page_type && + PIP_SIZE(pic->kpie[pic->pie_read - 1]) < 0xF) { + set_next_hva(next + pic->gpa_to_hva, "IN-PLACE INC"); + set_restart_gpa(next, "IN-PLACE INC"); + pic->kpie[pic->pie_read - 1]++; + WARN_ONCE(page_size < next-addr, "next-addr too large"); + return 0; + } + if (pic->pie_read >= pic->pie_read_max) { + set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); + return PAGE_IDLE_KBUF_FULL; + } + } + + set_next_hva(next + pic->gpa_to_hva, "NEW-ITEM"); + set_restart_gpa(next, "NEW-ITEM"); + pic->kpie[pic->pie_read] = PIP_COMPOSE(page_type, 1); + pic->pie_read++; + + return 0; +} + +static int init_page_idle_ctrl_buffer(struct page_idle_ctrl *pic) +{ + pic->pie_read = 0; + pic->pie_read_max = min(PAGE_IDLE_KBUF_SIZE, + pic->buf_size - pic->bytes_copied); + /* reserve space for PIP_CMD_SET_HVA in the end */ + pic->pie_read_max -= sizeof(uint64_t) + 1; + + /* + * Align with PAGE_IDLE_KBUF_FULL + * logic in pic_add_page(), to avoid pic->pie_read = 0 when + * PAGE_IDLE_KBUF_FULL happened. + */ + if (pic->pie_read_max <= sizeof(uint64_t) + 2) + return PAGE_IDLE_KBUF_FULL; + + memset(pic->kpie, 0, sizeof(pic->kpie)); + return 0; +} + +static void setup_page_idle_ctrl(struct page_idle_ctrl *pic, void *buf, + int buf_size, unsigned int flags) +{ + pic->buf = buf; + pic->buf_size = buf_size; + pic->bytes_copied = 0; + pic->next_hva = 0; + pic->gpa_to_hva = 0; + pic->restart_gpa = 0; + pic->last_va = 0; + pic->flags = flags; +} + +static int page_idle_copy_user(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end) +{ + int bytes_read; + int lc = 0; /* last copy? */ + int ret; + + dump_pic(pic); + + /* Break out of loop on no more progress. */ + if (!pic->pie_read) { + lc = 1; + if (start < end) + start = end; + } + + if (start >= end && start > pic->next_hva) { + set_next_hva(start, "TAIL-HOLE"); + pic_report_addr(pic, start); + } + + bytes_read = pic->pie_read; + if (!bytes_read) + return 1; + + ret = copy_to_user(pic->buf, pic->kpie, bytes_read); + if (ret) + return -EFAULT; + + pic->buf += bytes_read; + pic->bytes_copied += bytes_read; + if (pic->bytes_copied >= pic->buf_size) + return PAGE_IDLE_BUF_FULL; + if (lc) + return lc; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + cond_resched(); + return 0; +} + +#ifdef CONFIG_X86_64 +static int ept_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (KVM_CHECK_INVALID_SPTE(pte->pte)) { + page_type = PTE_IDLE; + } else if (!ept_pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + if (pic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) &pte->pte)) + page_type = PTE_DIRTY_M; + } + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + + +static int ept_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) + page_type = PMD_IDLE; + else if (!ept_pmd_present(*pmd)) + page_type = PMD_HOLE; /* likely won't hit here */ + else if (!pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else { + page_type = PMD_ACCESSED; + if ((pic->flags & SCAN_DIRTY_PAGE) && + test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) pmd)) + page_type = PMD_DIRTY_M; + } + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = ept_pte_range(pic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + + +static int ept_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (!ept_pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (pud_large(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = ept_pmd_range(pic, pud, addr, next); + + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int ept_p4d_range(struct page_idle_ctrl *pic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!ept_p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = ept_pud_range(pic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + + +static int ept_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + pgd_t *ept_root; + pgd_t *pgd; + unsigned long next; + int err = 0; + + WARN_ON(addr >= end); + + spin_lock(&pic->kvm->mmu_lock); + + vcpu = kvm_get_vcpu(pic->kvm, 0); + if (!vcpu) { + spin_unlock(&pic->kvm->mmu_lock); + return -EINVAL; + } + + mmu = kvm_arch_mmu_pointer(vcpu); + if (!VALID_PAGE(mmu->root_hpa)) { + spin_unlock(&pic->kvm->mmu_lock); + return -EINVAL; + } + + ept_root = __va(mmu->root_hpa); + + spin_unlock(&pic->kvm->mmu_lock); + local_irq_disable(); + pgd = pgd_offset_pgd(ept_root, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = ept_p4d_range(pic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + local_irq_enable(); + return err; +} + +static int ept_idle_supports_cpu(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + int ret; + + vcpu = kvm_get_vcpu(kvm, 0); + if (!vcpu) + return -EINVAL; + + spin_lock(&kvm->mmu_lock); + mmu = kvm_arch_mmu_pointer(vcpu); + if (kvm_mmu_ad_disabled(mmu)) { + pr_notice("CPU does not support EPT A/D bits tracking\n"); + ret = -EINVAL; + } else if (mmu->shadow_root_level != 4 + (!!pgtable_l5_enabled())) { + pr_notice("Unsupported EPT level %d\n", mmu->shadow_root_level); + ret = -EINVAL; + } else + ret = 0; + spin_unlock(&kvm->mmu_lock); + + return ret; +} + +#else +static int arm_pte_range(struct page_idle_ctrl *pic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else + page_type = PTE_ACCESSED; + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static int arm_pmd_range(struct page_idle_ctrl *pic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 0; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!if_pmd_thp_or_huge(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = arm_pte_range(pic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int arm_pud_range(struct page_idle_ctrl *pic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (if_pud_huge(*pud)) + err = pic_add_page(pic, addr, next, PUD_PRESENT); + else + err = arm_pmd_range(pic, pud, addr, next); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int arm_p4d_range(struct page_idle_ctrl *pic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = arm_pud_range(pic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int arm_page_range(struct page_idle_ctrl *pic, + unsigned long addr, + unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + struct kvm *kvm = pic->kvm; + int err = 0; + + WARN_ON(addr >= end); + + spin_lock(&pic->kvm->mmu_lock); + pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); + spin_unlock(&pic->kvm->mmu_lock); + + local_irq_disable(); + do { + next = stage2_pgd_addr_end(kvm, addr, end); + if (!pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = arm_p4d_range(pic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + local_irq_enable(); + return err; +} +#endif + +/* + * Depending on whether hva falls in a memslot: + * + * 1) found => return gpa and remaining memslot size in *addr_range + * + * |<----- addr_range --------->| + * [ mem slot ] + * ^hva + * + * 2) not found => return hole size in *addr_range + * + * |<----- addr_range --------->| + * [first mem slot above hva ] + * ^hva + * + * If hva is above all mem slots, *addr_range will be ~0UL. + * We can finish read(2). + */ +static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, + unsigned long hva, + unsigned long *addr_range) +{ + struct kvm *kvm = pic->kvm; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + unsigned long hva_end; + gfn_t gfn; + + *addr_range = ~0UL; + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(pic->kvm); + kvm_for_each_memslot(memslot, slots) { + hva_end = memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT); + + if (hva >= memslot->userspace_addr && hva < hva_end) { + gpa_t gpa; + + gfn = hva_to_gfn_memslot(hva, memslot); + *addr_range = hva_end - hva; + gpa = gfn_to_gpa(gfn); + mutex_unlock(&kvm->slots_lock); + return gpa; + } + + if (memslot->userspace_addr > hva) + *addr_range = min(*addr_range, + memslot->userspace_addr - hva); + } + mutex_unlock(&kvm->slots_lock); + return INVALID_PAGE; +} + +static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, + unsigned long start, unsigned long end) +{ + unsigned long gpa_addr; + unsigned long addr_range; + unsigned long va_end; + int ret; + +#ifdef CONFIG_X86_64 + ret = ept_idle_supports_cpu(pic->kvm); + if (ret) + return ret; +#endif + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + gpa_addr = vm_idle_find_gpa(pic, start, &addr_range); + + if (gpa_addr == INVALID_PAGE) { + pic->gpa_to_hva = 0; + if (addr_range == ~0UL) { + set_restart_gpa(TASK_SIZE, "EOF"); + va_end = end; + } else { + start += addr_range; + set_restart_gpa(start, "OUT-OF-SLOT"); + va_end = start; + } + } else { + pic->gpa_to_hva = start - gpa_addr; +#ifdef CONFIG_ARM64 + arm_page_range(pic, gpa_addr, gpa_addr + addr_range); +#else + ept_page_range(pic, gpa_addr, gpa_addr + addr_range); +#endif + va_end = pic->gpa_to_hva + gpa_addr + addr_range; + } + + start = pic->restart_gpa + pic->gpa_to_hva; + ret = page_idle_copy_user(pic, start, va_end); + if (ret) + break; + } + + if (pic->bytes_copied) + ret = 0; + return ret; +} + +static ssize_t vm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct page_idle_ctrl *pic; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + int ret; + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + pic->kvm = mm_kvm(mm); + + ret = vm_idle_walk_hva_range(pic, hva_start, hva_end); + if (ret) + goto out_kvm; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_kvm: + return ret; + +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + +static ssize_t page_scan_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + + if ((hva_start >= TASK_SIZE) || (hva_end >= TASK_SIZE)) { + debug_printk("page_idle_read past TASK_SIZE: %pK %pK %lx\n", + hva_start, hva_end, TASK_SIZE); + return 0; + } + if (hva_end <= hva_start) { + debug_printk("page_idle_read past EOF: %pK %pK\n", + hva_start, hva_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("page_idle_read unaligned ppos: %pK\n", + hva_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("page_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + if (!mm_kvm(mm)) + return mm_idle_read(file, buf, count, ppos); + + return vm_idle_read(file, buf, count, ppos); +} + +static int page_scan_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int page_scan_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + struct kvm *kvm; + int ret = 0; + + if (!mm) { + ret = -EBADF; + goto out; + } + + kvm = mm_kvm(mm); + if (!kvm) { + ret = -EINVAL; + goto out; + } +#ifdef CONFIG_X86_64 + spin_lock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); +#endif + +out: + module_put(THIS_MODULE); + return ret; +} + +static int mm_idle_pmd_large(pmd_t pmd) +{ +#ifdef CONFIG_ARM64 + return if_pmd_thp_or_huge(pmd); +#else + return pmd_large(pmd); +#endif +} + +static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, + unsigned long addr, unsigned long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* + * Skip duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, + * walk_page_range() can call on the same PMD twice. + */ + if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK)) { + debug_printk("ignore duplicate addr %pK %pK\n", + addr, pic->last_va); + return 0; + } + pic->last_va = addr; + + if (pic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!mm_idle_pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, + (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = pic_add_page(pic, addr, next, page_type); + else + err = mm_idle_pte_range(pic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page_idle_ctrl *pic = walk->private; + + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + if ((addr & PUD_MASK) != (pic->last_va & PUD_MASK)) { + pic_add_page(pic, addr, next, PUD_PRESENT); + pic->last_va = addr; + } + spin_unlock(ptl); + return 1; + } + + return 0; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + if (vma->vm_file) { + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct page_idle_ctrl *pic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret = 0; + + ret = init_page_idle_ctrl_buffer(pic); + if (ret) + return ret; + + for (; start < end;) { + down_read(&walk->mm->mmap_lock); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = walk_page_range(walk->mm, start, end, + walk->ops, walk->private); + local_irq_enable(); + } else + set_restart_gpa(vma->vm_start, "VMA-HOLE"); + } else + set_restart_gpa(TASK_SIZE, "EOF"); + up_read(&walk->mm->mmap_lock); + + WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); + start = pic->restart_gpa; + ret = page_idle_copy_user(pic, start, end); + if (ret) + break; + } + + if (pic->bytes_copied) { + if (ret != PAGE_IDLE_BUF_FULL && pic->next_hva < end) + debug_printk("partial scan: next_hva=%pK end=%pK\n", + pic->next_hva, end); + ret = 0; + } else + WARN_ONCE(1, "nothing read"); + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct mm_walk_ops *mm_walk_ops = NULL; + struct mm_walk mm_walk = {}; + struct page_idle_ctrl *pic; + unsigned long va_start = *ppos; + unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (va_end <= va_start) { + debug_printk("%s past EOF: %pK %pK\n", + __func__, va_start, va_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("%s unaligned ppos: %pK\n", + __func__, va_start); + return -EINVAL; + } + if (count < PAGE_IDLE_BUF_MIN) { + debug_printk("%s small count: %lx\n", + __func__, (unsigned long)count); + return -EINVAL; + } + + pic = kzalloc(sizeof(*pic), GFP_KERNEL); + if (!pic) + return -ENOMEM; + + mm_walk_ops = kzalloc(sizeof(struct mm_walk_ops), GFP_KERNEL); + if (!mm_walk_ops) { + kfree(pic); + return -ENOMEM; + } + + setup_page_idle_ctrl(pic, buf, count, file->f_flags); + + mm_walk_ops->pmd_entry = mm_idle_pmd_entry; + mm_walk_ops->pud_entry = mm_idle_pud_entry; + mm_walk_ops->test_walk = mm_idle_test_walk; + + mm_walk.mm = mm; + mm_walk.ops = mm_walk_ops; + mm_walk.private = pic; + mm_walk.pgd = NULL; + mm_walk.no_vma = false; + ret = mm_idle_walk_range(pic, va_start, va_end, &mm_walk); + if (ret) + goto out_free; + + ret = pic->bytes_copied; + *ppos = pic->next_hva; +out_free: + kfree(pic); + kfree(mm_walk_ops); + return ret; +} + +extern struct file_operations proc_page_scan_operations; + +static int page_scan_entry(void) +{ + proc_page_scan_operations.owner = THIS_MODULE; + proc_page_scan_operations.read = page_scan_read; + proc_page_scan_operations.open = page_scan_open; + proc_page_scan_operations.release = page_scan_release; + return 0; +} + +static void page_scan_exit(void) +{ + memset(&proc_page_scan_operations, 0, + sizeof(proc_page_scan_operations)); +} + +MODULE_LICENSE("GPL"); +module_init(page_scan_entry); +module_exit(page_scan_exit); diff --git a/fs/proc/memig_scan.h b/fs/proc/memig_scan.h new file mode 100644 index 000000000000..305739f92eef --- /dev/null +++ b/fs/proc/memig_scan.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PAGE_IDLE_H +#define _PAGE_IDLE_H + +#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ +#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ +#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ + +enum ProcIdlePageType { + PTE_ACCESSED, /* 4k page */ + PMD_ACCESSED, /* 2M page */ + PUD_PRESENT, /* 1G page */ + + PTE_DIRTY_M, + PMD_DIRTY_M, + + PTE_IDLE, + PMD_IDLE, + PMD_IDLE_PTES, /* all PTE idle */ + + PTE_HOLE, + PMD_HOLE, + + PIP_CMD, + + IDLE_PAGE_TYPE_MAX +}; + +#define PIP_TYPE(a) (0xf & (a >> 4)) +#define PIP_SIZE(a) (0xf & a) +#define PIP_COMPOSE(type, nr) ((type << 4) | nr) + +#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0) + +#ifndef INVALID_PAGE +#define INVALID_PAGE ~0UL +#endif + +#ifdef CONFIG_ARM64 +#define _PAGE_MM_BIT_ACCESSED 10 +#else +#define _PAGE_MM_BIT_ACCESSED _PAGE_BIT_ACCESSED +#endif + +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_EPT_ACCESSED 8 +#define _PAGE_BIT_EPT_DIRTY 9 +#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED) +#define _PAGE_EPT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_EPT_DIRTY) + +#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7)) + +static inline int ept_pte_present(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pmd_present(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pud_present(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_p4d_present(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pgd_present(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pte_accessed(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pmd_accessed(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pud_accessed(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_p4d_accessed(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pgd_accessed(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_ACCESSED; +} +#endif + +extern struct file_operations proc_page_scan_operations; + +#define PAGE_IDLE_KBUF_FULL 1 +#define PAGE_IDLE_BUF_FULL 2 +#define PAGE_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3) + +#define PAGE_IDLE_KBUF_SIZE 8000 + +struct page_idle_ctrl { + struct mm_struct *mm; + struct kvm *kvm; + + uint8_t kpie[PAGE_IDLE_KBUF_SIZE]; + int pie_read; + int pie_read_max; + + void __user *buf; + int buf_size; + int bytes_copied; + + unsigned long next_hva; /* GPA for EPT; VA for PT */ + unsigned long gpa_to_hva; + unsigned long restart_gpa; + unsigned long last_va; + + unsigned int flags; +}; + +#endif diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 420510f6a545..8fdd14fb8ff9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1770,6 +1770,72 @@ const struct file_operations proc_pagemap_operations = { .open = pagemap_open, .release = pagemap_release, }; + +/* will be filled when kvm_ept_idle module loads */ +struct file_operations proc_page_scan_operations = { +}; +EXPORT_SYMBOL_GPL(proc_page_scan_operations); + +static ssize_t mm_idle_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + int ret = 0; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + return ret; + } + if (proc_page_scan_operations.read) + ret = proc_page_scan_operations.read(file, buf, count, ppos); + + mmput(mm); + return ret; +} + +static int mm_idle_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = NULL; + + if (!file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + + if (proc_page_scan_operations.open) + return proc_page_scan_operations.open(inode, file); + + return 0; +} + +static int mm_idle_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) { + if (!mm_kvm(mm)) + flush_tlb_mm(mm); + mmdrop(mm); + } + + if (proc_page_scan_operations.release) + return proc_page_scan_operations.release(inode, file); + + return 0; +} + +const struct file_operations proc_mm_idle_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = mm_idle_read, + .open = mm_idle_open, + .release = mm_idle_release, +}; + + #endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 306a3d1a0fa6..8665358da427 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,7 +31,7 @@
struct address_space; struct mem_cgroup; - +struct kvm; /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -804,6 +804,10 @@ struct mm_struct { #endif /* CONFIG_LRU_GEN */ } __randomize_layout;
+#if IS_ENABLED(CONFIG_KVM) && !defined(__GENKSYMS__) + struct kvm *kvm; +#endif + /* * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. @@ -815,6 +819,18 @@ struct mm_struct { MT_FLAGS_USE_RCU) extern struct mm_struct init_mm;
+#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return mm->kvm; +} +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) +{ + return NULL; +} +#endif + /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { diff --git a/lib/Kconfig b/lib/Kconfig index 5c2da561c516..d6ab862f7040 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -728,6 +728,12 @@ config PARMAN config OBJAGG tristate "objagg" if COMPILE_TEST
+config MEMIG_SCAN_MODULE + tristate "module: memig page scan for memig support" + help + memig page scan feature + used to scan the virtual address of the target process + endmenu
config GENERIC_IOREMAP diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 624671aaa60d..0f27d48b2d45 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1969,6 +1969,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) spin_unlock(ptl); return NULL; } +EXPORT_SYMBOL_GPL(__pud_trans_huge_lock);
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index cb23f8a15c13..0d39aebb432e 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -481,6 +481,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } while (start = next, start < end); return err; } +EXPORT_SYMBOL_GPL(walk_page_range);
/** * walk_page_range_novma - walk a range of pagetables not backed by a vma diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 65f94f592ff8..905da44be082 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1297,6 +1297,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); +#if IS_ENABLED(CONFIG_KVM) + mm->kvm = NULL; +#endif kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); @@ -5054,6 +5057,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) * cases it will be called by the final fput(file) and will take * care of doing kvm_put_kvm(kvm). */ +#if IS_ENABLED(CONFIG_KVM) + kvm->mm->kvm = kvm; +#endif kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
fd_install(fd, file);