From: Anup Patel anup.patel@wdc.com
euleros inclusion category: feature bugzilla: NA CVE: NA
This patch implements MMU notifiers for KVM RISC-V so that Guest physical address space is in-sync with Host physical address space.
This will allow swapping, page migration, etc to work transparently with KVM RISC-V.
Link: https://gitee.com/openeuler/kernel/issues/I1RR1Y Signed-off-by: Anup Patel anup.patel@wdc.com Acked-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: Alexander Graf graf@amazon.com Signed-off-by: Mingwang Li limingwang@huawei.com Reviewed-by: Yifei Jiang jiangyifei@huawei.com Signed-off-by: Xie XiuQi xiexiuqi@huawei.com --- arch/riscv/include/asm/kvm_host.h | 7 ++ arch/riscv/kvm/Kconfig | 1 + arch/riscv/kvm/mmu.c | 129 +++++++++++++++++++++++++++++- arch/riscv/kvm/vm.c | 1 + 4 files changed, 137 insertions(+), 1 deletion(-)
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 9e754c0848b3..24985abbe2e7 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -200,6 +200,13 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
+#define KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); + void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa, unsigned long vmid); void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid); void __kvm_riscv_hfence_gvma_gpa(unsigned long gpa); diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 88edd477b3a8..2356dc52ebb3 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -20,6 +20,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" depends on RISCV_SBI && MMU + select MMU_NOTIFIER select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 3d13e15e7555..705c341e68cb 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -369,6 +369,38 @@ int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
}
+static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + gpa_t gpa, u64 size, + void *data), + void *data) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int ret = 0; + + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gpa; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; + ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); + } + + return ret; +} + void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { } @@ -504,6 +536,95 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return ret; }
+static int kvm_unmap_hva_handler(struct kvm *kvm, + gpa_t gpa, u64 size, void *data) +{ + stage2_unmap_range(kvm, gpa, size); + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end) +{ + if (!kvm->arch.pgd) + return 0; + + handle_hva_to_gpa(kvm, start, end, + &kvm_unmap_hva_handler, NULL); + return 0; +} + +static int kvm_set_spte_handler(struct kvm *kvm, + gpa_t gpa, u64 size, void *data) +{ + pte_t *pte = (pte_t *)data; + + WARN_ON(size != PAGE_SIZE); + stage2_set_pte(kvm, 0, NULL, gpa, pte); + + return 0; +} + +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + unsigned long end = hva + PAGE_SIZE; + kvm_pfn_t pfn = pte_pfn(pte); + pte_t stage2_pte; + + if (!kvm->arch.pgd) + return 0; + + stage2_pte = pfn_pte(pfn, PAGE_WRITE_EXEC); + handle_hva_to_gpa(kvm, hva, end, + &kvm_set_spte_handler, &stage2_pte); + + return 0; +} + +static int kvm_age_hva_handler(struct kvm *kvm, + gpa_t gpa, u64 size, void *data) +{ + pte_t *ptep; + u32 ptep_level = 0; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); + + if (!stage2_get_leaf_entry(kvm, gpa, &ptep, &ptep_level)) + return 0; + + return ptep_test_and_clear_young(NULL, 0, ptep); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + if (!kvm->arch.pgd) + return 0; + + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, + gpa_t gpa, u64 size, void *data) +{ + pte_t *ptep; + u32 ptep_level = 0; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &ptep, &ptep_level)) + return 0; + + return pte_young(*ptep); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.pgd) + return 0; + + return handle_hva_to_gpa(kvm, hva, hva, + kvm_test_age_hva_handler, NULL); +} + int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, gpa_t gpa, unsigned long hva, bool is_write) @@ -518,7 +639,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache; bool logging = (memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)) ? true : false; - unsigned long vma_pagesize; + unsigned long vma_pagesize, mmu_seq;
mmap_read_lock(current->mm);
@@ -557,6 +678,8 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, return ret; }
+ mmu_seq = kvm->mmu_notifier_seq; + hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable); if (hfn == KVM_PFN_ERR_HWPOISON) { send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, @@ -575,6 +698,9 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu,
spin_lock(&kvm->mmu_lock);
+ if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + if (writeable) { kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); @@ -588,6 +714,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, if (ret) kvm_err("Failed to map in stage2\n");
+out_unlock: spin_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(hfn); kvm_release_pfn_clean(hfn); diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index c5aab5478c38..fd84b4d914dc 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -54,6 +54,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: + case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_ONE_REG: case KVM_CAP_READONLY_MEM: