From: zhong jiang <zhongjiang@huawei.com>

mainline inclusion
from mainline-4.20
commit f0725345e3e127032376e4fcb6b0fc893237fcef
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
There are some extra semicolons in kvm_target_cpu(); remove them.
Signed-off-by: zhong jiang <zhongjiang@huawei.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/kvm/guest.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index ff9908c5d496..ff00e1c8023a 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -432,15 +432,15 @@ int __attribute_const__ kvm_target_cpu(void)
 			return KVM_ARM_TARGET_CORTEX_A53;
 		case ARM_CPU_PART_CORTEX_A57:
 			return KVM_ARM_TARGET_CORTEX_A57;
-		};
+		}
 		break;
 	case ARM_CPU_IMP_APM:
 		switch (part_number) {
 		case APM_CPU_PART_POTENZA:
 			return KVM_ARM_TARGET_XGENE_POTENZA;
-		};
+		}
 		break;
-	};
+	}
 
 	/* Return a default generic target */
 	return KVM_ARM_TARGET_GENERIC_V8;
From: Paolo Bonzini <pbonzini@redhat.com>

mainline inclusion
from mainline-5.0
commit e5d83c74a5800c2a1fa3ba982c1c4b2b39ae6db2
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
The first such capability to be handled in virt/kvm/ will be manual dirty page reprotection.
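For illustration, after this change a VM-wide capability is enabled with a plain KVM_ENABLE_CAP ioctl on the VM file descriptor. A minimal userspace sketch (assumes <linux/kvm.h>, <sys/ioctl.h> and an open VM fd; the helper name is made up here, not part of the patch):

	/* Hypothetical helper: probe a VM capability, then enable it. */
	static int enable_vm_cap(int vm_fd, unsigned int capability)
	{
		struct kvm_enable_cap cap = { .cap = capability, .args = { 1 } };

		/* KVM_CHECK_EXTENSION works on the VM fd (KVM_CAP_CHECK_EXTENSION_VM). */
		if (ioctl(vm_fd, KVM_CHECK_EXTENSION, capability) <= 0)
			return -1;	/* not supported */
		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}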
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@redhat.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 Documentation/virtual/kvm/api.txt | 13 +++++++++----
 arch/powerpc/kvm/powerpc.c        | 14 ++------------
 arch/s390/kvm/kvm-s390.c          | 11 +----------
 arch/x86/kvm/x86.c                | 14 ++------------
 include/linux/kvm_host.h          |  2 ++
 virt/kvm/kvm_main.c               | 25 +++++++++++++++++++++++++
 6 files changed, 41 insertions(+), 38 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 1ed80bbf46a9..4378fbbcc30f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1130,10 +1130,15 @@ documentation when it pops into existence).
 
 4.37 KVM_ENABLE_CAP
 
-Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
-Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM),
-	       mips (only KVM_CAP_ENABLE_CAP), ppc, s390
-Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
+Capability: KVM_CAP_ENABLE_CAP
+Architectures: mips, ppc, s390
+Type: vcpu ioctl
+Parameters: struct kvm_enable_cap (in)
+Returns: 0 on success; -1 on error
+
+Capability: KVM_CAP_ENABLE_CAP_VM
+Architectures: all
+Type: vcpu ioctl
 Parameters: struct kvm_enable_cap (in)
 Returns: 0 on success; -1 on error
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 51cd66dc1bb0..ee89f6fbea86 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -523,7 +523,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
-	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_ONE_REG:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_DEVICE_CTRL:
@@ -2090,8 +2089,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 }
 
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
-				   struct kvm_enable_cap *cap)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+			    struct kvm_enable_cap *cap)
 {
 	int r;
 
@@ -2271,15 +2270,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		break;
 	}
-	case KVM_ENABLE_CAP:
-	{
-		struct kvm_enable_cap cap;
-		r = -EFAULT;
-		if (copy_from_user(&cap, argp, sizeof(cap)))
-			goto out;
-		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
-		break;
-	}
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	case KVM_CREATE_SPAPR_TCE_64: {
 		struct kvm_create_spapr_tce_64 create_tce_64;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 18662c1a9361..c18ed934677f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -474,7 +474,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_CSS_SUPPORT:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_DEVICE_CTRL:
-	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_S390_IRQCHIP:
 	case KVM_CAP_VM_ATTRIBUTES:
 	case KVM_CAP_MP_STATE:
@@ -618,7 +617,7 @@ static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
 	}
 }
 
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 {
 	int r;
 
@@ -1915,14 +1914,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_s390_inject_vm(kvm, &s390int);
 		break;
 	}
-	case KVM_ENABLE_CAP: {
-		struct kvm_enable_cap cap;
-		r = -EFAULT;
-		if (copy_from_user(&cap, argp, sizeof(cap)))
-			break;
-		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
-		break;
-	}
 	case KVM_CREATE_IRQCHIP: {
 		struct kvm_irq_routing_entry routing;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fbfe5a24188e..e53040611337 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3062,7 +3062,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV_TIME:
 	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 	case KVM_CAP_TSC_DEADLINE_TIMER:
-	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_DISABLE_QUIRKS:
 	case KVM_CAP_SET_BOOT_CPU_ID:
 	case KVM_CAP_SPLIT_IRQCHIP:
@@ -4455,8 +4454,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 	return 0;
 }
 
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
-				   struct kvm_enable_cap *cap)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+			    struct kvm_enable_cap *cap)
 {
 	int r;
 
@@ -4791,15 +4790,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
-	case KVM_ENABLE_CAP: {
-		struct kvm_enable_cap cap;
-
-		r = -EFAULT;
-		if (copy_from_user(&cap, argp, sizeof(cap)))
-			goto out;
-		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
-		break;
-	}
 	case KVM_MEMORY_ENCRYPT_OP: {
 		r = -ENOTTY;
 		if (kvm_x86_ops->mem_enc_op)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cec751bf6084..6d73855e0974 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -824,6 +824,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 			  bool line_status);
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+			    struct kvm_enable_cap *cap);
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 16c55b62f33d..5b4f4231292b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3185,6 +3185,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #endif
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
+	case KVM_CAP_ENABLE_CAP_VM:
 		return 1;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO:
@@ -3204,6 +3205,21 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	return kvm_vm_ioctl_check_extension(kvm, arg);
 }
 
+int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+						  struct kvm_enable_cap *cap)
+{
+	return -EINVAL;
+}
+
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
+					   struct kvm_enable_cap *cap)
+{
+	switch (cap->cap) {
+	default:
+		return kvm_vm_ioctl_enable_cap(kvm, cap);
+	}
+}
+
 static long kvm_vm_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -3217,6 +3233,15 @@ static long kvm_vm_ioctl(struct file *filp,
 	case KVM_CREATE_VCPU:
 		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
 		break;
+	case KVM_ENABLE_CAP: {
+		struct kvm_enable_cap cap;
+
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
+		break;
+	}
 	case KVM_SET_USER_MEMORY_REGION: {
 		struct kvm_userspace_memory_region kvm_userspace_mem;
From: Zenghui Yu <yuzenghui@huawei.com>

mainline inclusion
from mainline-5.7
commit 969ce8b5260d8ec01e6f1949d2927a86419663ce
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
It's likely that the vcpu fails to handle all virtual interrupts if userspace decides to destroy it, leaving the pending ones in the ap_list. If an unhandled one is an LPI, its vgic_irq structure will eventually be leaked because of an extra refcount increment in vgic_queue_irq_unlock().
This was detected by kmemleak on almost every guest destroy; the backtrace is as follows:
unreferenced object 0xffff80725aed5500 (size 128):
  comm "CPU 5/KVM", pid 40711, jiffies 4298024754 (age 166366.512s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 08 01 a9 73 6d 80 ff ff  ...........sm...
    c8 61 ee a9 00 20 ff ff 28 1e 55 81 6c 80 ff ff  .a... ..(.U.l...
  backtrace:
    [<000000004bcaa122>] kmem_cache_alloc_trace+0x2dc/0x418
    [<0000000069c7dabb>] vgic_add_lpi+0x88/0x418
    [<00000000bfefd5c5>] vgic_its_cmd_handle_mapi+0x4dc/0x588
    [<00000000cf993975>] vgic_its_process_commands.part.5+0x484/0x1198
    [<000000004bd3f8e3>] vgic_its_process_commands+0x50/0x80
    [<00000000b9a65b2b>] vgic_mmio_write_its_cwriter+0xac/0x108
    [<0000000009641ebb>] dispatch_mmio_write+0xd0/0x188
    [<000000008f79d288>] __kvm_io_bus_write+0x134/0x240
    [<00000000882f39ac>] kvm_io_bus_write+0xe0/0x150
    [<0000000078197602>] io_mem_abort+0x484/0x7b8
    [<0000000060954e3c>] kvm_handle_guest_abort+0x4cc/0xa58
    [<00000000e0d0cd65>] handle_exit+0x24c/0x770
    [<00000000b44a7fad>] kvm_arch_vcpu_ioctl_run+0x460/0x1988
    [<0000000025fb897c>] kvm_vcpu_ioctl+0x4f8/0xee0
    [<000000003271e317>] do_vfs_ioctl+0x160/0xcd8
    [<00000000e7f39607>] ksys_ioctl+0x98/0xd8
Fix it by retiring all pending LPIs in the ap_list on the destroy path.
p.s. I can also reproduce it on a normal guest shutdown. It happens because userspace still sends LPIs to the vcpu (through the KVM_SIGNAL_MSI ioctl) while the guest is being shut down and is unable to handle them. A little strange though, and I haven't dug further...
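For reference, retiring the pending LPIs amounts to walking the ap_list and dropping the reference each queued LPI holds. A sketch of vgic_flush_pending_lpis() modeled on the mainline 5.7 implementation (lock and field names as in mainline; the helper itself is not part of this diff):

	void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
	{
		struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
		struct vgic_irq *irq, *tmp;
		unsigned long flags;

		raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);

		list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
			if (irq->intid >= VGIC_MIN_LPI) {
				raw_spin_lock(&irq->irq_lock);
				/* Unqueue the LPI and drop the ap_list reference. */
				list_del(&irq->ap_list);
				irq->vcpu = NULL;
				raw_spin_unlock(&irq->irq_lock);
				vgic_put_irq(vcpu->kvm, irq);
			}
		}

		raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
	}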
Reviewed-by: James Morse <james.morse@arm.com>
Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
[maz: moved the distributor deallocation down to avoid an UAF splat]
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Link: https://lore.kernel.org/r/20200414030349.625-2-yuzenghui@huawei.com
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 virt/kvm/arm/vgic/vgic-init.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 1f39c61428f9..0d11d09a37e0 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -375,6 +375,12 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 
+	/*
+	 * Retire all pending LPIs on this vcpu anyway as we're
+	 * going to destroy it.
+	 */
+	vgic_flush_pending_lpis(vcpu);
+
 	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
 }
 
@@ -386,10 +392,10 @@ static void __kvm_vgic_destroy(struct kvm *kvm)
 
 	vgic_debug_destroy(kvm);
 
-	kvm_vgic_dist_destroy(kvm);
-
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_vgic_vcpu_destroy(vcpu);
+
+	kvm_vgic_dist_destroy(kvm);
 }
 
 void kvm_vgic_destroy(struct kvm *kvm)
From: Paolo Bonzini <pbonzini@redhat.com>

mainline inclusion
from mainline-5.0
commit 8fe65a8299f9e1f40cb95308ab7b3c4ad80bf801
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
Once manual dirty log reprotection is enabled, kvm_get_dirty_log_protect's pointer argument will always be false on exit, because no TLB flush will be needed until the manual re-protection operation. Rename it from "is_dirty" to "flush", which more accurately tells the caller what they have to do with it.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/mips/kvm/mips.c     | 6 +++---
 arch/x86/kvm/x86.c       | 6 +++---
 include/linux/kvm_host.h | 2 +-
 virt/kvm/arm/arm.c       | 6 +++---
 virt/kvm/kvm_main.c      | 6 +++---
 5 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 79485790f7b5..a8331c358e34 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1006,14 +1006,14 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	bool is_dirty = false;
+	bool flush = false;
 	int r;
 
 	mutex_lock(&kvm->slots_lock);
 
-	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
+	r = kvm_get_dirty_log_protect(kvm, log, &flush);
 
-	if (is_dirty) {
+	if (flush) {
 		slots = kvm_memslots(kvm);
 		memslot = id_to_memslot(slots, log->slot);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e53040611337..02a62fc42a04 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4417,7 +4417,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
  */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
-	bool is_dirty = false;
+	bool flush = false;
 	int r;
 
 	mutex_lock(&kvm->slots_lock);
@@ -4428,14 +4428,14 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	if (kvm_x86_ops->flush_log_dirty)
 		kvm_x86_ops->flush_log_dirty(kvm);
 
-	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
+	r = kvm_get_dirty_log_protect(kvm, log, &flush);
 
 	/*
 	 * All the TLBs can be flushed out of mmu lock, see the comments in
 	 * kvm_mmu_slot_remove_write_access().
 	 */
 	lockdep_assert_held(&kvm->slots_lock);
-	if (is_dirty)
+	if (flush)
 		kvm_flush_remote_tlbs(kvm);
 
 	mutex_unlock(&kvm->slots_lock);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6d73855e0974..3b4c373b7c8a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -812,7 +812,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
 			struct kvm_dirty_log *log, int *is_dirty);
 
 int kvm_get_dirty_log_protect(struct kvm *kvm,
-			struct kvm_dirty_log *log, bool *is_dirty);
+			struct kvm_dirty_log *log, bool *flush);
 
 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					     struct kvm_memory_slot *slot,
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 4a5765277646..ec7060ad909b 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1297,14 +1297,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
  */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
-	bool is_dirty = false;
+	bool flush = false;
 	int r;
 
 	mutex_lock(&kvm->slots_lock);
 
-	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
+	r = kvm_get_dirty_log_protect(kvm, log, &flush);
 
-	if (is_dirty)
+	if (flush)
 		kvm_flush_remote_tlbs(kvm);
 
 	mutex_unlock(&kvm->slots_lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b4f4231292b..ced4277f1223 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1214,7 +1214,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
  *
  */
 int kvm_get_dirty_log_protect(struct kvm *kvm,
-			struct kvm_dirty_log *log, bool *is_dirty)
+			struct kvm_dirty_log *log, bool *flush)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
@@ -1241,7 +1241,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 	memset(dirty_bitmap_buffer, 0, n);
 
 	spin_lock(&kvm->mmu_lock);
-	*is_dirty = false;
+	*flush = false;
 	for (i = 0; i < n / sizeof(long); i++) {
 		unsigned long mask;
 		gfn_t offset;
@@ -1249,7 +1249,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 		if (!dirty_bitmap[i])
 			continue;
 
-		*is_dirty = true;
+		*flush = true;
 
 		mask = xchg(&dirty_bitmap[i], 0);
 		dirty_bitmap_buffer[i] = mask;
From: Paolo Bonzini <pbonzini@redhat.com>

mainline inclusion
from mainline-5.0
commit 2a31b9db153530df4aa02dac8c32837bf5f47019
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
There are two problems with KVM_GET_DIRTY_LOG. First, and less important, it can take kvm->mmu_lock for an extended period of time. Second, its user can actually see many false positives in some cases. The latter is due to a benign race like this:
1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects them.
2. The guest modifies the pages, causing them to be marked dirty.
3. Userspace actually copies the pages.
4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though
   they were not written to since (3).
This is especially a problem for large guests, where the time between (1) and (3) can be substantial. This patch introduces a new capability which, when enabled, makes KVM_GET_DIRTY_LOG not write-protect the pages it returns. Instead, userspace has to explicitly clear the dirty log bits just before using the content of the page. The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a 64-page granularity rather than requiring a full memslot to be synced; this way, the mmu_lock is taken for small amounts of time, and only a small amount of time will pass between write protection of pages and the sending of their content.
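A typical userspace loop with the capability enabled then looks roughly like this (illustrative sketch; vm_fd, slot and bitmap management are assumed, error handling elided):

	struct kvm_dirty_log get = { .slot = slot, .dirty_bitmap = bitmap };
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.first_page = 0,
		.num_pages = 64,	/* one 64-page chunk at a time */
		.dirty_bitmap = bitmap,
	};

	/* With the capability enabled, this no longer write-protects pages. */
	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get);
	/* ... copy out the pages reported dirty ... */
	/* Re-enable dirty tracking just before the content is consumed. */
	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);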
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 Documentation/virtual/kvm/api.txt             |  70 +++++++++-
 arch/mips/kvm/mips.c                          |  23 ++++
 arch/x86/kvm/x86.c                            |  27 ++++
 include/linux/kvm_host.h                      |   7 +
 include/uapi/linux/kvm.h                      |  15 ++
 tools/testing/selftests/kvm/Makefile          |   3 +
 .../selftests/kvm/clear_dirty_log_test.c      |   2 +
 tools/testing/selftests/kvm/dirty_log_test.c  |  19 +++
 .../testing/selftests/kvm/include/kvm_util.h  |   2 +
 tools/testing/selftests/kvm/lib/kvm_util.c    |  13 ++
 virt/kvm/arm/arm.c                            |  16 +++
 virt/kvm/kvm_main.c                           | 130 ++++++++++++++++--
 12 files changed, 311 insertions(+), 16 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/clear_dirty_log_test.c
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 4378fbbcc30f..e46a7166eb47 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -311,6 +311,10 @@ the address space for which you want to return the dirty bitmap.
 They must be less than the value that KVM_CHECK_EXTENSION returns for
 the KVM_CAP_MULTI_ADDRESS_SPACE capability.
 
+The bits in the dirty bitmap are cleared before the ioctl returns, unless
+KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled.  For more information,
+see the description of the capability.
+
 4.9 KVM_SET_MEMORY_ALIAS
 
@@ -3726,6 +3730,46 @@ Returns: 0 on success, -1 on error
 
 This copies the vcpu's kvm_nested_state struct from userspace to the kernel.
 For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
 
+4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl)
+
+Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_dirty_log (in)
+Returns: 0 on success, -1 on error
+
+/* for KVM_CLEAR_DIRTY_LOG */
+struct kvm_clear_dirty_log {
+	__u32 slot;
+	__u32 num_pages;
+	__u64 first_page;
+	union {
+		void __user *dirty_bitmap; /* one bit per page */
+		__u64 padding;
+	};
+};
+
+The ioctl clears the dirty status of pages in a memory slot, according to
+the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap
+field.  Bit 0 of the bitmap corresponds to page "first_page" in the
+memory slot, and num_pages is the size in bits of the input bitmap.
+Both first_page and num_pages must be a multiple of 64.  For each bit
+that is set in the input bitmap, the corresponding page is marked "clean"
+in KVM's dirty bitmap, and dirty tracking is re-enabled for that page
+(for example via write-protection, or by clearing the dirty bit in
+a page table entry).
+
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies
+the address space for which you want to return the dirty bitmap.
+They must be less than the value that KVM_CHECK_EXTENSION returns for
+the KVM_CAP_MULTI_ADDRESS_SPACE capability.
+
+This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+is enabled; for more information, see the description of the capability.
+However, it can always be used as long as KVM_CHECK_EXTENSION confirms
+that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present.
+
+
 5. The kvm_run structure
 ------------------------
 
@@ -4574,7 +4618,7 @@ hpage module parameter is not set to 1, -EINVAL is returned.
 While it is generally possible to create a huge page backed VM without
 this capability, the VM will not be able to run.
 
-7.14 KVM_CAP_MSR_PLATFORM_INFO
+7.15 KVM_CAP_MSR_PLATFORM_INFO
 
 Architectures: x86
 Parameters: args[0] whether feature should be enabled or not
@@ -4583,6 +4627,30 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise,
 a #GP would be raised when the guest tries to access. Currently, this
 capability does not enable write permissions of this MSR for the guest.
 
+7.16 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+
+Architectures: all
+Parameters: args[0] whether feature should be enabled or not
+
+With this capability enabled, KVM_GET_DIRTY_LOG will not automatically
+clear and write-protect all pages that are returned as dirty.
+Rather, userspace will have to do this operation separately using
+KVM_CLEAR_DIRTY_LOG.
+
+At the cost of a slightly more complicated operation, this provides better
+scalability and responsiveness for two reasons.  First,
+KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather
+than requiring to sync a full memslot; this ensures that KVM does not
+take spinlocks for an extended period of time.  Second, in some cases a
+large amount of time can pass between a call to KVM_GET_DIRTY_LOG and
+userspace actually using the data in the page.  Pages can be modified
+during this time, which is inefficint for both the guest and userspace:
+the guest will incur a higher penalty due to write protection faults,
+while userspace can see false reports of dirty pages.  Manual reprotection
+helps reducing this time, improving guest performance and reducing the
+number of dirty log false positives.
+
+
 8. Other capabilities.
 ----------------------
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a8331c358e34..6238d072c5e0 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1025,6 +1025,29 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return r;
 }
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	bool flush = false;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
+
+	if (flush) {
+		slots = kvm_memslots(kvm);
+		memslot = id_to_memslot(slots, log->slot);
+
+		/* Let implementation handle TLB/GVA invalidation */
+		kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	long r;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02a62fc42a04..2b45de3dc2ba 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4442,6 +4442,33 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return r;
 }
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+	bool flush = false;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	/*
+	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+	 */
+	if (kvm_x86_ops->flush_log_dirty)
+		kvm_x86_ops->flush_log_dirty(kvm);
+
+	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
+
+	/*
+	 * All the TLBs can be flushed out of mmu lock, see the comments in
+	 * kvm_mmu_slot_remove_write_access().
+	 */
+	lockdep_assert_held(&kvm->slots_lock);
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 			  bool line_status)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3b4c373b7c8a..214ae642022e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -477,6 +477,7 @@ struct kvm {
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
+	bool manual_dirty_log_protect;
 	struct dentry *debugfs_dentry;
 	struct kvm_stat_data **debugfs_stat_data;
 	struct srcu_struct srcu;
@@ -814,6 +815,9 @@ int kvm_get_dirty_log(struct kvm *kvm,
 int kvm_get_dirty_log_protect(struct kvm *kvm,
 			      struct kvm_dirty_log *log, bool *flush);
 
+int kvm_clear_dirty_log_protect(struct kvm *kvm,
+				struct kvm_clear_dirty_log *log, bool *flush);
+
 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					     struct kvm_memory_slot *slot,
 					     gfn_t gfn_offset,
@@ -822,6 +826,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
+				  struct kvm_clear_dirty_log *log);
+
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 			  bool line_status);
 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 723ba0682468..ddc13d703bfc 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -488,6 +488,17 @@ struct kvm_dirty_log {
 	};
 };
 
+/* for KVM_CLEAR_DIRTY_LOG */
+struct kvm_clear_dirty_log {
+	__u32 slot;
+	__u32 num_pages;
+	__u64 first_page;
+	union {
+		void __user *dirty_bitmap; /* one bit per page */
+		__u64 padding2;
+	};
+};
+
 /* for KVM_SET_SIGNAL_MASK */
 struct kvm_signal_mask {
 	__u32 len;
@@ -966,6 +977,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158
 #define KVM_CAP_MSR_PLATFORM_INFO 159
 #define KVM_CAP_ARM_VM_IPA_SIZE 165 /* returns maximum IPA bits for a VM */
+#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174
 
 #define KVM_CAP_ARM_CPU_FEATURE 555
@@ -1426,6 +1438,9 @@ struct kvm_enc_region {
 #define KVM_GET_NESTED_STATE	_IOWR(KVMIO, 0xbe, struct kvm_nested_state)
 #define KVM_SET_NESTED_STATE	_IOW(KVMIO, 0xbf, struct kvm_nested_state)
 
+/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */
+#define KVM_CLEAR_DIRTY_LOG	_IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index cc83e2fd3787..459c19b4a244 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -14,6 +14,9 @@ TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test
 TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += state_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
+TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
+
+TEST_GEN_PROGS_aarch64 += clear_dirty_log_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c
new file mode 100644
index 000000000000..749336937d37
--- /dev/null
+++ b/tools/testing/selftests/kvm/clear_dirty_log_test.c
@@ -0,0 +1,2 @@
+#define USE_CLEAR_DIRTY_LOG
+#include "dirty_log_test.c"
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index a9c4b5e21d7e..378b86028e45 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -230,6 +230,13 @@ int main(int argc, char *argv[])
 		interval = TEST_HOST_LOOP_INTERVAL;
 	int opt;
 
+#ifdef USE_CLEAR_DIRTY_LOG
+	if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT)) {
+		fprintf(stderr, "KVM_CLEAR_DIRTY_LOG not available, skipping tests\n");
+		exit(KSFT_SKIP);
+	}
+#endif
+
 	while ((opt = getopt(argc, argv, "hi:I:")) != -1) {
 		switch (opt) {
 		case 'i':
@@ -258,6 +265,14 @@ int main(int argc, char *argv[])
 
 	vm = vm_create_default(VCPU_ID, TEST_MEM_PAGES, guest_code);
 
+#ifdef USE_CLEAR_DIRTY_LOG
+	struct kvm_enable_cap cap = {};
+
+	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT;
+	cap.args[0] = 1;
+	vm_enable_cap(vm, &cap);
+#endif
+
 	/* Add an extra memory slot for testing dirty logging */
 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
 				    TEST_MEM_OFFSET,
@@ -288,6 +303,10 @@ int main(int argc, char *argv[])
 		/* Give the vcpu thread some time to dirty some pages */
 		usleep(interval * 1000);
 		kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
+#ifdef USE_CLEAR_DIRTY_LOG
+		kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
+				       DIV_ROUND_UP(host_num_pages, 64) * 64);
+#endif
 		vm_dirty_log_verify(bmap, *iteration);
 		(*iteration)++;
 	}
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 3acf9a91704c..feefa8945b53 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -57,6 +57,8 @@ void kvm_vm_free(struct kvm_vm *vmp);
 void kvm_vm_restart(struct kvm_vm *vmp, int perm);
 void kvm_vm_release(struct kvm_vm *vmp);
 void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+			    uint64_t first_page, uint32_t num_pages);
 
 int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm,
 		       const vm_vaddr_t gva, size_t len);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index fb5d2d1e0c04..3edcb1aa12ac 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -202,6 +202,19 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
 		    strerror(-ret));
 }
 
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+			    uint64_t first_page, uint32_t num_pages)
+{
+	struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
+					    .first_page = first_page,
+					    .num_pages = num_pages };
+	int ret;
+
+	ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
+	TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s",
+		    strerror(-ret));
+}
+
 /* Userspace Memory Region Find
  *
  * Input Args:
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index ec7060ad909b..8e04e7ee7d08 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1311,6 +1311,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return r;
 }
 
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+	bool flush = false;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 					struct kvm_arm_device_addr *dev_addr)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ced4277f1223..c3604286529d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1193,7 +1193,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 /**
  * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
- *	are dirty write protect them for next write.
+ *	and reenable dirty page tracking for the corresponding pages.
  * @kvm:	pointer to kvm instance
  * @log:	slot id and address to which we copy the log
  * @is_dirty:	flag set if any page is dirty
@@ -1236,37 +1236,116 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 		return -ENOENT;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
+	*flush = false;
+	if (kvm->manual_dirty_log_protect) {
+		/*
+		 * Unlike kvm_get_dirty_log, we always return false in *flush,
+		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
+		 * is some code duplication between this function and
+		 * kvm_get_dirty_log, but hopefully all architecture
+		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
+		 * can be eliminated.
+		 */
+		dirty_bitmap_buffer = dirty_bitmap;
+	} else {
+		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+		memset(dirty_bitmap_buffer, 0, n);
+		spin_lock(&kvm->mmu_lock);
+		for (i = 0; i < n / sizeof(long); i++) {
+			unsigned long mask;
+			gfn_t offset;
+
+			if (!dirty_bitmap[i])
+				continue;
+
+			*flush = true;
+			mask = xchg(&dirty_bitmap[i], 0);
+			dirty_bitmap_buffer[i] = mask;
+
+			if (mask) {
+				offset = i * BITS_PER_LONG;
+				kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+									offset, mask);
+			}
+		}
+		spin_unlock(&kvm->mmu_lock);
+	}
+
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+
+/**
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
+ *	and reenable dirty page tracking for the corresponding pages.
+ * @kvm:	pointer to kvm instance
+ * @log:	slot id and address from which to fetch the bitmap of dirty pages
+ */
+int kvm_clear_dirty_log_protect(struct kvm *kvm,
+				struct kvm_clear_dirty_log *log, bool *flush)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int as_id, id, n;
+	gfn_t offset;
+	unsigned long i;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+
+	as_id = log->slot >> 16;
+	id = (u16)log->slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return -EINVAL;
+
+	if ((log->first_page & 63) || (log->num_pages & 63))
+		return -EINVAL;
+
+	slots = __kvm_memslots(kvm, as_id);
+	memslot = id_to_memslot(slots, id);
+
+	dirty_bitmap = memslot->dirty_bitmap;
+	if (!dirty_bitmap)
+		return -ENOENT;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
+	*flush = false;
 
 	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
-	memset(dirty_bitmap_buffer, 0, n);
+	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
+		return -EFAULT;
 
 	spin_lock(&kvm->mmu_lock);
-	*flush = false;
-	for (i = 0; i < n / sizeof(long); i++) {
-		unsigned long mask;
-		gfn_t offset;
+	for (offset = log->first_page,
+	     i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
+	     i++, offset += BITS_PER_LONG) {
+		unsigned long mask = *dirty_bitmap_buffer++;
+		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
 
-		if (!dirty_bitmap[i])
+		if (!mask)
 			continue;
 
-		*flush = true;
-
-		mask = xchg(&dirty_bitmap[i], 0);
-		dirty_bitmap_buffer[i] = mask;
+		mask &= atomic_long_fetch_andnot(mask, p);
 
+		/*
+		 * mask contains the bits that really have been cleared.  This
+		 * never includes any bits beyond the length of the memslot (if
+		 * the length is not aligned to 64 pages), therefore it is not
+		 * a problem if userspace sets them in log->dirty_bitmap.
+		 */
 		if (mask) {
-			offset = i * BITS_PER_LONG;
+			*flush = true;
 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
 								offset, mask);
 		}
 	}
 
 	spin_unlock(&kvm->mmu_lock);
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
-		return -EFAULT;
+
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
 #endif
 
 bool kvm_largepages_enabled(void)
@@ -3186,6 +3265,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+#endif
 		return 1;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO:
@@ -3215,6 +3297,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 					   struct kvm_enable_cap *cap)
 {
 	switch (cap->cap) {
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+		if (cap->flags || (cap->args[0] & ~1))
+			return -EINVAL;
+		kvm->manual_dirty_log_protect = cap->args[0];
+		return 0;
+#endif
 	default:
 		return kvm_vm_ioctl_enable_cap(kvm, cap);
 	}
@@ -3262,6 +3351,17 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
 		break;
 	}
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CLEAR_DIRTY_LOG: {
+		struct kvm_clear_dirty_log log;
+
+		r = -EFAULT;
+		if (copy_from_user(&log, argp, sizeof(log)))
+			goto out;
+		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+		break;
+	}
+#endif
 #ifdef CONFIG_KVM_MMIO
 	case KVM_REGISTER_COALESCED_MMIO: {
 		struct kvm_coalesced_mmio_zone zone;
From: Tomas Bortoli <tomasbortoli@gmail.com>

mainline inclusion
from mainline-5.0
commit 98938aa8edd66dc95024d7c936a4bc315f6615ff
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
The function at issue does not fully validate the content of the structure pointed to by the log parameter, even though its content has just been copied from userspace and lacks validation. Fix that.
Moreover, change the type of n to unsigned long as that is the type returned by kvm_dirty_bitmap_bytes().
Signed-off-by: Tomas Bortoli <tomasbortoli@gmail.com>
Reported-by: syzbot+028366e52c9ace67deb3@syzkaller.appspotmail.com
[Squashed the fix from Paolo. - Radim.]
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 virt/kvm/kvm_main.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c3604286529d..9f4d87702c2e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1288,9 +1288,9 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	int as_id, id, n;
+	int as_id, id;
 	gfn_t offset;
-	unsigned long i;
+	unsigned long i, n;
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap_buffer;
 
@@ -1310,6 +1310,11 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 		return -ENOENT;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
+
+	if (log->first_page > memslot->npages ||
+	    log->num_pages > memslot->npages - log->first_page)
+		return -EINVAL;
+
 	*flush = false;
 
 	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
From: Lan Tianyu <Tianyu.Lan@microsoft.com>

mainline inclusion
from mainline-5.1
commit a67794cafbc4594debf53dbe4e2a7708426f492e
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
The value of "dirty_bitmap[i]" is already checked before setting its value to mask. The following check of "mask" is redundant. The check of "mask" was introduced by commit 58d2930f4ee3 ("KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()"); revert it.
Signed-off-by: Lan Tianyu <Tianyu.Lan@microsoft.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 virt/kvm/kvm_main.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9f4d87702c2e..d7b8620ec77c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1262,11 +1262,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 			mask = xchg(&dirty_bitmap[i], 0);
 			dirty_bitmap_buffer[i] = mask;
 
-			if (mask) {
-				offset = i * BITS_PER_LONG;
-				kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
-									offset, mask);
-			}
+			offset = i * BITS_PER_LONG;
+			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+								offset, mask);
 		}
 		spin_unlock(&kvm->mmu_lock);
 	}
From: Paolo Bonzini <pbonzini@redhat.com>

mainline inclusion
from mainline-5.2
commit 65c4189de8c1d995f6bc2cc96b22206405466b53
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
If a memory slot's size is not a multiple of 64 pages (256K), then the KVM_CLEAR_DIRTY_LOG API is unusable: clearing the final 64 pages either requires the requested page range to go beyond memslot->npages, or requires log->num_pages to be unaligned, and kvm_clear_dirty_log_protect requires log->num_pages to be both in range and aligned.
To allow this case, allow log->num_pages not to be a multiple of 64 if it ends exactly on the last page of the slot.
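As a worked example (illustrative numbers, not from the patch): for a 160-page memslot, the tail can now be cleared with first_page = 128 and num_pages = 32, because 128 + 32 lands exactly on the end of the slot; previously the (log->num_pages & 63) check rejected that call, leaving pages 128-159 impossible to clear.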
Reported-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 Documentation/virtual/kvm/api.txt | 5 +++--
 virt/kvm/kvm_main.c               | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e46a7166eb47..60b5094fec04 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3753,8 +3753,9 @@ The ioctl clears the dirty status of pages in a memory slot, according to
 the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap
 field.  Bit 0 of the bitmap corresponds to page "first_page" in the
 memory slot, and num_pages is the size in bits of the input bitmap.
-Both first_page and num_pages must be a multiple of 64.  For each bit
-that is set in the input bitmap, the corresponding page is marked "clean"
+first_page must be a multiple of 64; num_pages must also be a multiple of
+64 unless first_page + num_pages is the size of the memory slot.  For each
+bit that is set in the input bitmap, the corresponding page is marked "clean"
 in KVM's dirty bitmap, and dirty tracking is re-enabled for that page
 (for example via write-protection, or by clearing the dirty bit in
 a page table entry).
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d7b8620ec77c..c0fb9dc09284 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1297,7 +1297,7 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
-	if ((log->first_page & 63) || (log->num_pages & 63))
+	if (log->first_page & 63)
 		return -EINVAL;
 
 	slots = __kvm_memslots(kvm, as_id);
@@ -1310,7 +1310,8 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 	n = kvm_dirty_bitmap_bytes(memslot);
 
 	if (log->first_page > memslot->npages ||
-	    log->num_pages > memslot->npages - log->first_page)
+	    log->num_pages > memslot->npages - log->first_page ||
+	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
 		return -EINVAL;
 
 	*flush = false;
From: Jiang Biao <benbjiang@tencent.com>

mainline inclusion
from mainline-5.2
commit b8b002209c061273fd1ef7bb3c3c32301623a282
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
is_dirty has been renamed to flush, but the comment for it is outdated. And the description of the @flush parameter for kvm_clear_dirty_log_protect() is missing; add it in this patch as well.
Signed-off-by: Jiang Biao <benbjiang@tencent.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 virt/kvm/kvm_main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c0fb9dc09284..54ca97c6cbac 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1192,11 +1192,11 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 /**
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
  *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
- * @is_dirty:	flag set if any page is dirty
+ * @flush:	true if TLB flush is needed by caller
 *
 * We need to keep it in mind that VCPU threads can write to the bitmap
 * concurrently. So, to avoid losing track of dirty pages we keep the
@@ -1280,6 +1280,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
+ * @flush:	true if TLB flush is needed by caller
 */
 int kvm_clear_dirty_log_protect(struct kvm *kvm,
 				struct kvm_clear_dirty_log *log, bool *flush)
From: Peter Xu <peterx@redhat.com>

mainline inclusion
from mainline-5.2
commit 4ddc9204572c33f2eb91fbdb1d99d8078388b67d
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
kvm_dirty_bitmap_bytes() will return the size of the dirty bitmap of the whole memslot rather than the size of the bitmap passed in by the ioctl. Here for KVM_CLEAR_DIRTY_LOG we should only copy exactly the size of bitmap that covers kvm_clear_dirty_log.num_pages.
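As a worked example (on a 64-bit host): for kvm_clear_dirty_log.num_pages = 128, the copy size becomes ALIGN(128, 64) / 8 = 16 bytes, regardless of how large the memslot's full dirty bitmap is.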
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 54ca97c6cbac..c2dc06203d0d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1308,7 +1308,7 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 	if (!dirty_bitmap)
 		return -ENOENT;
 
-	n = kvm_dirty_bitmap_bytes(memslot);
+	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
 
 	if (log->first_page > memslot->npages ||
 	    log->num_pages > memslot->npages - log->first_page ||
From: Peter Xu <peterx@redhat.com>

mainline inclusion
from mainline-5.2
commit 53eac7a8f8cf3d7dc5ecac1946f31442f5eee5f3
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
Just imagine the case where num_pages < BITS_PER_LONG: the loop will be skipped when it shouldn't be.
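Concretely: with num_pages = 32 and BITS_PER_LONG = 64, the old loop bound log->num_pages / BITS_PER_LONG evaluates to 0, so the partial word is never processed; DIV_ROUND_UP(32, 64) = 1 visits it exactly once.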
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 virt/kvm/kvm_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c2dc06203d0d..134895e26e3d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1322,8 +1322,8 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 		return -EFAULT;
 
 	spin_lock(&kvm->mmu_lock);
-	for (offset = log->first_page,
-	     i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
+	for (offset = log->first_page, i = offset / BITS_PER_LONG,
+	     n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
 	     i++, offset += BITS_PER_LONG) {
 		unsigned long mask = *dirty_bitmap_buffer++;
 		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
From: Peter Xu <peterx@redhat.com>

mainline inclusion
from mainline-5.2
commit d7547c55cbe7471255ca51f14bcd4699f5eaabe5
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
The previous KVM_CAP_MANUAL_DIRTY_LOG_PROTECT has some problems that block correct usage from userspace. Obsolete the old one and introduce a new capability bit for it.
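Userspace that wants manual reprotection should therefore probe the new bit, e.g. (illustrative sketch, assuming an open VM fd):

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2) > 0) {
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
			.args = { 1 },
		};
		ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}
	/* else: fall back to the classic KVM_GET_DIRTY_LOG semantics */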
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 Documentation/virtual/kvm/api.txt            | 15 ++++++++++-----
 include/uapi/linux/kvm.h                     |  5 +++--
 tools/testing/selftests/kvm/dirty_log_test.c |  4 ++--
 virt/kvm/kvm_main.c                          |  4 ++--
 4 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 60b5094fec04..ced1bb165bb6 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -312,7 +312,7 @@ They must be less than the value that KVM_CHECK_EXTENSION returns for
 the KVM_CAP_MULTI_ADDRESS_SPACE capability.
 
 The bits in the dirty bitmap are cleared before the ioctl returns, unless
-KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled.  For more information,
+KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled.  For more information,
 see the description of the capability.
 
@@ -3732,7 +3732,7 @@ the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
 
 4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl)
 
-Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
 Architectures: x86
 Type: vm ioctl
 Parameters: struct kvm_dirty_log (in)
@@ -3765,10 +3765,10 @@ the address space for which you want to return the dirty bitmap.
 They must be less than the value that KVM_CHECK_EXTENSION returns for
 the KVM_CAP_MULTI_ADDRESS_SPACE capability.
 
-This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
 is enabled; for more information, see the description of the capability.
 However, it can always be used as long as KVM_CHECK_EXTENSION confirms
-that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present.
+that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is present.
 
 
 5. The kvm_run structure
@@ -4628,7 +4628,7 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise,
 a #GP would be raised when the guest tries to access. Currently, this
 capability does not enable write permissions of this MSR for the guest.
 
-7.16 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT
+7.16 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
 
 Architectures: all
 Parameters: args[0] whether feature should be enabled or not
@@ -4651,6 +4651,11 @@ while userspace can see false reports of dirty pages.  Manual reprotection
 helps reducing this time, improving guest performance and reducing the
 number of dirty log false positives.
 
+KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name
+KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make
+it hard or impossible to use it correctly.  The availability of
+KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed.
+Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT.
 
 8. Other capabilities.
 ----------------------
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ddc13d703bfc..58c8ea239677 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -977,7 +977,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158
 #define KVM_CAP_MSR_PLATFORM_INFO 159
 #define KVM_CAP_ARM_VM_IPA_SIZE 165 /* returns maximum IPA bits for a VM */
-#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
+#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 /* Obsolete */
+#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 168
 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174
 
 #define KVM_CAP_ARM_CPU_FEATURE 555
@@ -1438,7 +1439,7 @@ struct kvm_enc_region {
 #define KVM_GET_NESTED_STATE	_IOWR(KVMIO, 0xbe, struct kvm_nested_state)
 #define KVM_SET_NESTED_STATE	_IOW(KVMIO, 0xbf, struct kvm_nested_state)
 
-/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */
+/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT_2 */
 #define KVM_CLEAR_DIRTY_LOG	_IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log)
 
 /* Secure Encrypted Virtualization command */
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 378b86028e45..6a8f620e8472 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -231,7 +231,7 @@ int main(int argc, char *argv[])
 	int opt;
 
 #ifdef USE_CLEAR_DIRTY_LOG
-	if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT)) {
+	if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2)) {
 		fprintf(stderr, "KVM_CLEAR_DIRTY_LOG not available, skipping tests\n");
 		exit(KSFT_SKIP);
 	}
@@ -268,7 +268,7 @@ int main(int argc, char *argv[])
 #ifdef USE_CLEAR_DIRTY_LOG
 	struct kvm_enable_cap cap = {};
 
-	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT;
+	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
 	cap.args[0] = 1;
 	vm_enable_cap(vm, &cap);
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 134895e26e3d..a441f9a75d1b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3271,7 +3271,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
-	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
 #endif
 		return 1;
 #ifdef CONFIG_KVM_MMIO
@@ -3303,7 +3303,7 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 {
 	switch (cap->cap) {
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
-	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
 		if (cap->flags || (cap->args[0] & ~1))
 			return -EINVAL;
 		kvm->manual_dirty_log_protect = cap->args[0];
From: Jay Zhou <jianjay.zhou@huawei.com>

mainline inclusion
from mainline-5.7
commit 3c9bd4006bfc2dccda1823db61b3f470ef91cfaa
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
It could take kvm->mmu_lock for an extended period of time when enabling dirty log for the first time. The main cost is to clear all the D-bits of last level SPTEs. This situation can benefit from manual dirty log protect as well, which can reduce the mmu_lock time taken. The sequence is like this:
1. Initialize all the bits of the dirty bitmap to 1 when enabling
   dirty log for the first time
2. Only write protect the huge pages
3. KVM_GET_DIRTY_LOG returns the dirty bitmap info
4. KVM_CLEAR_DIRTY_LOG will clear D-bit for each of the leaf level
   SPTEs gradually in small chunks
Under the Intel(R) Xeon(R) Gold 6152 CPU @ 2.10GHz environment, I did some tests with a 128G Windows VM and counted the time taken by memory_global_dirty_log_start; here are the numbers:

	VM Size		Before		After optimization
	128G		460ms		10ms
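Userspace opts in to the lazy first-enable behaviour by passing both flag bits in args[0] when enabling the capability; roughly (illustrative sketch, flag names as introduced by this patch, vm_fd assumed):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
		.args = { KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
			  KVM_DIRTY_LOG_INITIALLY_SET },
	};

	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);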
Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Reviewed-by: Ying Fang <fangying1@huawei.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 Documentation/virtual/kvm/api.txt | 18 +++++++++++++++---
 arch/x86/include/asm/kvm_host.h   |  6 +++++-
 arch/x86/kvm/mmu.c                |  7 ++++---
 arch/x86/kvm/vmx.c                |  3 ++-
 arch/x86/kvm/x86.c                | 21 +++++++++++++++++----
 include/linux/kvm_host.h          | 11 ++++++++++-
 include/uapi/linux/kvm.h          |  3 +++
 virt/kvm/kvm_main.c               | 24 +++++++++++++++++-------
 8 files changed, 73 insertions(+), 20 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index ced1bb165bb6..77252973790c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4633,8 +4633,13 @@ capability does not enable write permissions of this MSR for the guest.
 Architectures: all
 Parameters: args[0] whether feature should be enabled or not
 
-With this capability enabled, KVM_GET_DIRTY_LOG will not automatically
-clear and write-protect all pages that are returned as dirty.
+Valid flags are::
+
+ #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE   (1 << 0)
+ #define KVM_DIRTY_LOG_INITIALLY_SET           (1 << 1)
+
+With KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE is set, KVM_GET_DIRTY_LOG will not
+automatically clear and write-protect all pages that are returned as dirty.
 Rather, userspace will have to do this operation separately using
 KVM_CLEAR_DIRTY_LOG.
 
@@ -4645,12 +4650,19 @@ than requiring to sync a full memslot; this ensures that KVM does not
 take spinlocks for an extended period of time.  Second, in some cases a
 large amount of time can pass between a call to KVM_GET_DIRTY_LOG and
 userspace actually using the data in the page.  Pages can be modified
-during this time, which is inefficint for both the guest and userspace:
+during this time, which is inefficient for both the guest and userspace:
 the guest will incur a higher penalty due to write protection faults,
 while userspace can see false reports of dirty pages.  Manual reprotection
 helps reducing this time, improving guest performance and reducing the
 number of dirty log false positives.
 
+With KVM_DIRTY_LOG_INITIALLY_SET set, all the bits of the dirty bitmap
+will be initialized to 1 when created.  This also improves performance because
+dirty logging can be enabled gradually in small chunks on the first call
+to KVM_CLEAR_DIRTY_LOG.  KVM_DIRTY_LOG_INITIALLY_SET depends on
+KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on
+x86 for now).
+
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make
 it hard or impossible to use it correctly.  The availability of
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5ac2c33e9885..6b4f4032d5c0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -49,6 +49,9 @@
 
 #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
 
+#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+					KVM_DIRTY_LOG_INITIALLY_SET)
+
 /* x86-specific vcpu->requests bit members */
 #define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
 #define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
@@ -1196,7 +1199,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
-				      struct kvm_memory_slot *memslot);
+				      struct kvm_memory_slot *memslot,
+				      int start_level);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5faa49a95ac9..5073aa2a7d11 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5701,13 +5701,14 @@ static bool slot_rmap_write_protect(struct kvm *kvm,
 }
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
-				      struct kvm_memory_slot *memslot)
+				      struct kvm_memory_slot *memslot,
+				      int start_level)
 {
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
-				      false);
+	flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
+				  start_level, PT_MAX_HUGEPAGE_LEVEL, false);
 	spin_unlock(&kvm->mmu_lock);
 
 	/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d6bcbce6c15c..6ca72e3ee372 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -13833,7 +13833,8 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
 				     struct kvm_memory_slot *slot)
 {
-	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+	if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
+		kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
 	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b45de3dc2ba..28a76ae738ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9344,7 +9344,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 {
 	/* Still write protect RO slot */
 	if (new->flags & KVM_MEM_READONLY) {
-		kvm_mmu_slot_remove_write_access(kvm, new);
+		kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
 		return;
 	}
 
@@ -9379,10 +9379,23 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 	 * See the comments in fast_page_fault().
 	 */
 	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-		if (kvm_x86_ops->slot_enable_log_dirty)
+		if (kvm_x86_ops->slot_enable_log_dirty) {
 			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
-		else
-			kvm_mmu_slot_remove_write_access(kvm, new);
+		} else {
+			int level =
+				kvm_dirty_log_manual_protect_and_init_set(kvm) ?
+				PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
+
+			/*
+			 * If we're with initial-all-set, we don't need
+			 * to write protect any small page because
+			 * they're reported as dirty already.  However
+			 * we still need to write-protect huge pages
+			 * so that the page split can happen lazily on
+			 * the first write to the huge page.
+			 */
+			kvm_mmu_slot_remove_write_access(kvm, new, level);
+		}
 	} else {
 		if (kvm_x86_ops->slot_disable_log_dirty)
 			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 214ae642022e..d10ba0ed63c2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -344,6 +344,10 @@ static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *mem
 	return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap);
 }
 
+#ifndef KVM_DIRTY_LOG_MANUAL_CAPS
+#define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE
+#endif
+
 struct kvm_s390_adapter_int {
 	u64 ind_addr;
 	u64 summary_addr;
@@ -477,7 +481,7 @@ struct kvm {
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
-	bool manual_dirty_log_protect;
+	u64 manual_dirty_log_protect;
 	struct dentry *debugfs_dentry;
 	struct kvm_stat_data **debugfs_stat_data;
 	struct srcu_struct srcu;
@@ -511,6 +515,11 @@ struct kvm {
 #define vcpu_err(vcpu, fmt, ...)					\
 	kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
+static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) +{ + return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); +} + static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 58c8ea239677..9cda64507776 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1592,4 +1592,7 @@ struct kvm_hyperv_eventfd { #define KVM_HYPERV_CONN_ID_MASK 0x00ffffff #define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0)
+#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) +#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a441f9a75d1b..9768ab63ef30 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -849,7 +849,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) * Allocation size is twice as large as the actual dirty bitmap size. * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. */ -static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
@@ -1076,8 +1076,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - if (kvm_create_dirty_bitmap(&new) < 0) + if (kvm_alloc_dirty_bitmap(&new) < 0) goto out_free; + + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + bitmap_set(new.dirty_bitmap, 0, new.npages); }
slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); @@ -3270,14 +3273,15 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: -#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT - case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: -#endif return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: return KVM_COALESCED_MMIO_PAGE_OFFSET; #endif +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: + return KVM_DIRTY_LOG_MANUAL_CAPS; +#endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; @@ -3303,11 +3307,17 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, { switch (cap->cap) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT - case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: - if (cap->flags || (cap->args[0] & ~1)) + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { + u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; + + if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) + allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; + + if (cap->flags || (cap->args[0] & ~allowed_options)) return -EINVAL; kvm->manual_dirty_log_protect = cap->args[0]; return 0; + } #endif default: return kvm_vm_ioctl_enable_cap(kvm, cap);
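To make the two-step flow in the api.txt text above concrete, a rough sketch of one dirty-log pass under manual-protect mode might look like the following; vm_fd, slot, npages and the caller-supplied bitmap buffer are illustrative assumptions:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch: fetch the dirty bitmap, consume it, then reprotect manually. */
static int sync_dirty_log(int vm_fd, __u32 slot, __u64 npages, void *bitmap)
{
	struct kvm_dirty_log log = { .slot = slot, .dirty_bitmap = bitmap };
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.first_page = 0,
		.num_pages = npages,
		.dirty_bitmap = bitmap,
	};

	/* Step 1: fetch the bitmap; KVM no longer clears or reprotects here. */
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0)
		return -1;

	/* ... consume the dirty pages (e.g. send them to the target) ... */

	/*
	 * Step 2: clear and write-protect only afterwards, shrinking the
	 * window for dirty-log false positives.  A real VMM would clear
	 * in sub-slot chunks rather than the whole slot at once.
	 */
	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}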
From: Keqian Zhu zhukeqian1@huawei.com
mainline inclusion
from mainline-5.8
commit c862626e19efdc26b26481515470b160e8fe52f3
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
There is already support for enabling the dirty log gradually in small chunks for x86, added in commit 3c9bd4006bfc ("KVM: x86: enable dirty log gradually in small chunks"). This adds support for arm64.

x86 still write-protects all huge pages when DIRTY_LOG_INITIALLY_ALL_SET is enabled. For arm64, however, both huge pages and normal pages can be write-protected gradually by userspace.
On a Huawei Kunpeng 920 2.6GHz platform, I ran some tests on 128G Linux VMs with different page sizes. The memory pressure is 127G in each case. The time taken by memory_global_dirty_log_start in QEMU is listed below:

Page Size      Before    After Optimization
4K             650ms     1.8ms
2M             4ms       1.8ms
1G             2ms       1.8ms

Besides the time reduction, the biggest improvement is that we minimize the performance side effects on the guest (from dissolving huge pages and marking memslots dirty) after enabling the dirty log. (A userspace sketch of this gradual clearing follows the patch below.)
Signed-off-by: Keqian Zhu zhukeqian1@huawei.com
Signed-off-by: Marc Zyngier maz@kernel.org
Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com
Reviewed-by: Ying Fang fangying1@huawei.com
Reviewed-by: Zenghui Yu yuzenghui@huawei.com
Link: https://lore.kernel.org/r/20200413122023.52583-1-zhukeqian1@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 Documentation/virtual/kvm/api.txt | 2 +-
 arch/arm64/include/asm/kvm_host.h | 3 +++
 virt/kvm/arm/mmu.c                | 12 ++++++++++--
 3 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 77252973790c..33beea9db4ae 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4661,7 +4661,7 @@ will be initialized to 1 when created. This also improves performance because dirty logging can be enabled gradually in small chunks on the first call to KVM_CLEAR_DIRTY_LOG. KVM_DIRTY_LOG_INITIALLY_SET depends on KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on -x86 for now). +x86 and arm64 for now).
KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b3ed490a50f1..34aba7d53d10 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -53,6 +53,9 @@ #define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
+#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ + KVM_DIRTY_LOG_INITIALLY_SET) + DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
int __attribute_const__ kvm_target_cpu(void); diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index f110566823c9..aef836a93086 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -2309,8 +2309,16 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * allocated dirty_bitmap[], dirty pages will be be tracked while the * memory slot is write protected. */ - if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) - kvm_mmu_wp_memory_region(kvm, mem->slot); + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + /* + * If we're with initial-all-set, we don't need to write + * protect any pages because they're all reported as dirty. + * Huge pages and normal pages will be write protect gradually. + */ + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { + kvm_mmu_wp_memory_region(kvm, mem->slot); + } + } }
int kvm_arch_prepare_memory_region(struct kvm *kvm,
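The "gradual" write protection referred to above is driven entirely from userspace. A plausible pattern (an illustrative sketch under assumed names, not code from this series) is to walk the initially-all-ones bitmap in bounded chunks on the first KVM_CLEAR_DIRTY_LOG pass, so the write-protection cost is spread out rather than paid up front:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Pages per KVM_CLEAR_DIRTY_LOG call; the chunk size is a tuning assumption. */
#define CLEAR_CHUNK_PAGES 0x10000ULL

static int clear_initially_set(int vm_fd, __u32 slot, __u64 npages, void *bitmap)
{
	struct kvm_clear_dirty_log clear;
	__u64 first, left;

	/* With KVM_DIRTY_LOG_INITIALLY_SET the bitmap starts as all ones. */
	memset(bitmap, 0xff, (npages + 7) / 8);

	for (first = 0; first < npages; first += CLEAR_CHUNK_PAGES) {
		left = npages - first;
		clear.slot = slot;
		clear.first_page = first;
		/* Mainline requires 64-page-aligned ranges except at slot end. */
		clear.num_pages = left < CLEAR_CHUNK_PAGES ? left : CLEAR_CHUNK_PAGES;
		/*
		 * Bit 0 of the buffer maps to first_page, so the same
		 * all-ones buffer can be reused for every chunk.
		 */
		clear.dirty_bitmap = bitmap;
		if (ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear) < 0)
			return -1;
	}
	return 0;
}

Each call write-protects only num_pages worth of mappings, which is consistent with the roughly constant 1.8ms figure above: the up-front cost at memory_global_dirty_log_start no longer scales with how much memory must be protected.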
From: Xiangyou Xie xiexiangyou@huawei.com
euleros inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------

Keep the kvm.h headers in sync with the in-kernel copies to avoid compile warnings when building the perf tools.
Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com
Reviewed-by: Zenghui Yu yuzenghui@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 tools/arch/arm/include/uapi/asm/kvm.h   |  4 +-
 tools/arch/arm64/include/uapi/asm/kvm.h |  4 +-
 tools/arch/x86/include/uapi/asm/kvm.h   |  1 +
 tools/include/uapi/linux/kvm.h          | 50 ++++++++++++++++++++++++-
 4 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h index 4602464ebdfb..86db092e4c2f 100644 --- a/tools/arch/arm/include/uapi/asm/kvm.h +++ b/tools/arch/arm/include/uapi/asm/kvm.h @@ -254,8 +254,10 @@ struct kvm_vcpu_events { #define KVM_DEV_ARM_ITS_CTRL_RESET 4
/* KVM_IRQ_LINE irq field index values */ +#define KVM_ARM_IRQ_VCPU2_SHIFT 28 +#define KVM_ARM_IRQ_VCPU2_MASK 0xf #define KVM_ARM_IRQ_TYPE_SHIFT 24 -#define KVM_ARM_IRQ_TYPE_MASK 0xff +#define KVM_ARM_IRQ_TYPE_MASK 0xf #define KVM_ARM_IRQ_VCPU_SHIFT 16 #define KVM_ARM_IRQ_VCPU_MASK 0xff #define KVM_ARM_IRQ_NUM_SHIFT 0 diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 97c3478ee6e7..307d13612399 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -265,8 +265,10 @@ struct kvm_vcpu_events { #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
/* KVM_IRQ_LINE irq field index values */ +#define KVM_ARM_IRQ_VCPU2_SHIFT 28 +#define KVM_ARM_IRQ_VCPU2_MASK 0xf #define KVM_ARM_IRQ_TYPE_SHIFT 24 -#define KVM_ARM_IRQ_TYPE_MASK 0xff +#define KVM_ARM_IRQ_TYPE_MASK 0xf #define KVM_ARM_IRQ_VCPU_SHIFT 16 #define KVM_ARM_IRQ_VCPU_MASK 0xff #define KVM_ARM_IRQ_NUM_SHIFT 0 diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index fd23d5778ea1..f1645578d9d0 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -378,6 +378,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 251be353f950..9cda64507776 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -189,9 +189,11 @@ struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 __u32 type; + __u32 pad1; union { struct { __u32 msr; + __u32 pad2; __u64 control; __u64 evt_page; __u64 msg_page; @@ -486,6 +488,17 @@ struct kvm_dirty_log { }; };
+/* for KVM_CLEAR_DIRTY_LOG */ +struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + /* for KVM_SET_SIGNAL_MASK */ struct kvm_signal_mask { __u32 len; @@ -744,12 +757,22 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_PPC_HV 1 #define KVM_VM_PPC_PR 2
-/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */ -#define KVM_VM_MIPS_TE 0 +/* on MIPS, 0 indicates auto, 1 forces VZ ASE, 2 forces trap & emulate */ +#define KVM_VM_MIPS_AUTO 0 #define KVM_VM_MIPS_VZ 1 +#define KVM_VM_MIPS_TE 2
#define KVM_S390_SIE_PAGE_OFFSET 1
+/* + * On arm64, machine type can be used to request the physical + * address size for the VM. Bits[7-0] are reserved for the guest + * PA size shift (i.e, log2(PA_Size)). For backward compatibility, + * value 0 implies the default IPA size, 40bits. + */ +#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL +#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ + ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) /* * ioctls for /dev/kvm fds: */ @@ -953,6 +976,12 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_NESTED_STATE 157 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 #define KVM_CAP_MSR_PLATFORM_INFO 159 +#define KVM_CAP_ARM_VM_IPA_SIZE 165 /* returns maximum IPA bits for a VM */ +#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 /* Obsolete */ +#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 168 +#define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 + +#define KVM_CAP_ARM_CPU_FEATURE 555
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1184,6 +1213,17 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; };
+#define ID_REG_MAX_NUMS 64 +struct id_reg_info { + __u64 sys_id; + __u64 sys_val; +}; + +struct id_registers { + struct id_reg_info regs[ID_REG_MAX_NUMS]; + __u64 num; +}; + /* * ioctls for VM fds */ @@ -1399,6 +1439,9 @@ struct kvm_enc_region { #define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) #define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state)
+/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT_2 */ +#define KVM_CLEAR_DIRTY_LOG _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ @@ -1549,4 +1592,7 @@ struct kvm_hyperv_eventfd { #define KVM_HYPERV_CONN_ID_MASK 0x00ffffff #define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0)
+#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) +#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) + #endif /* __LINUX_KVM_H */