
From: Zhiqiang Ni <nizhiqiang1@huawei.com>

virt inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/IBNSBL

--------------------------------

KVM marks a vCPU preempted whenever kvm_arch_vcpu_put() is called.
This makes the guest scheduler treat the vCPU as an unavailable idle
CPU even when it runs exclusively on a physical CPU, which causes many
harmful CPU migrations and hurts e.g. guest Java GC performance.

So take the average guest steal time into account when deciding
whether to mark a vCPU preempted: if it is negligible, we can safely
assume the vCPU runs exclusively on a physical CPU and should not be
marked preempted, which mitigates the issue.

Signed-off-by: Zhiqiang Ni <nizhiqiang1@huawei.com>
Signed-off-by: Dongxu Sun <sundongxu3@huawei.com>
---
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/kvm/arm.c              | 20 ++++++++++++++++++--
 arch/arm64/kvm/pvtime.c           |  8 ++++++++
 include/trace/events/kvm.h        | 24 ++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f171ab3d0d37..829e9b232ecf 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -407,6 +407,7 @@ struct kvm_vcpu_arch {
 	struct {
 		u64 last_steal;
 		gpa_t base;
+		u64 avg_steal;
 	} steal;

 	/* Guest PV sched state */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 240edaa9eb50..94e0ec08a757 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -94,6 +94,13 @@ static const struct kernel_param_ops force_wfi_trap_ops = {
 bool force_wfi_trap;
 module_param_cb(force_wfi_trap, &force_wfi_trap_ops, &force_wfi_trap, 0644);

+/*
+ * Set guest_steal_time_thresh to 0 to effectively disable this feature.
+ * Note 1024 should be a good guess as it works fine in the real workload.
+ */
+static unsigned long __read_mostly guest_steal_time_thresh = 1024;
+module_param(guest_steal_time_thresh, ulong, 0644);
+
 static int vcpu_req_reload_wfi_traps(const char *val, const struct kernel_param *kp)
 {
 	struct kvm *kvm;
@@ -600,8 +607,17 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)

 	vcpu->cpu = -1;

-	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
-		kvm_update_pvsched_preempted(vcpu, 1);
+	if (kvm_arm_is_pvsched_enabled(&vcpu->arch)) {
+		if (vcpu->arch.steal.avg_steal < guest_steal_time_thresh) {
+			kvm_update_pvsched_preempted(vcpu, 0);
+			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
+				vcpu->arch.steal.avg_steal, guest_steal_time_thresh, 0);
+		} else {
+			kvm_update_pvsched_preempted(vcpu, 1);
+			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
+				vcpu->arch.steal.avg_steal, guest_steal_time_thresh, 1);
+		}
+	}

 #ifdef CONFIG_KVM_HISI_VIRT
 	kvm_hisi_dvmbm_put(vcpu);
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index 920ac43077ad..9b6448e1b300 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -19,6 +19,14 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
 	u64 steal = 0;
 	int idx;

+	/*
+	 * Because workloads change over time, we keep avg_steal as a floating
+	 * average which ends up weighing recent steal time more than old ones.
+	 */
+	vcpu->arch.steal.avg_steal +=
+		READ_ONCE(current->sched_info.run_delay) - vcpu->arch.steal.last_steal;
+	vcpu->arch.steal.avg_steal /= 2;
+
 	if (base == GPA_INVALID)
 		return;

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 167e26659a0b..458a2a554f4d 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -504,6 +504,30 @@ TRACE_EVENT(kvm_test_age_hva,
 	TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
 );

+TRACE_EVENT(kvm_arm_set_vcpu_preempted,
+	TP_PROTO(unsigned int vcpu_id, u64 avg_steal, unsigned long thresh,
+		unsigned int update_preempted_value),
+	TP_ARGS(vcpu_id, avg_steal, thresh, update_preempted_value),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, vcpu_id)
+		__field(unsigned long long, avg_steal)
+		__field(unsigned long, thresh)
+		__field(unsigned int, update_preempted_value)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->avg_steal = avg_steal;
+		__entry->thresh = thresh;
+		__entry->update_preempted_value = update_preempted_value;
+	),
+
+	TP_printk("vcpu:%u avg steal time:%llu thresh:%lu update_preempted_value:%u",
+		__entry->vcpu_id, __entry->avg_steal, __entry->thresh,
+		__entry->update_preempted_value)
+);
+
 #endif /* _TRACE_KVM_MAIN_H */

 /* This part must be outside protection */
--
2.33.0
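
Note for reviewers (not part of the patch): the avg_steal update in
kvm_update_stolen_time() is an exponentially weighted moving average.
Each update adds the newest steal delta and halves the running sum, so
every steal-free interval halves the remembered steal time. The short
standalone userspace sketch below models that arithmetic together with
the threshold check done in kvm_arch_vcpu_put(); the threshold mirrors
the guest_steal_time_thresh default of 1024, while the sample deltas
are made-up illustrative values, not measurements.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

int main(void)
{
	/* Mirrors the guest_steal_time_thresh module parameter default. */
	const uint64_t thresh = 1024;
	uint64_t avg_steal = 0;
	/* Hypothetical per-update steal deltas (run_delay - last_steal). */
	const uint64_t deltas[] = { 0, 0, 8192, 8192, 0, 0, 0, 0 };

	for (size_t i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		/* Same arithmetic as the patch: add the newest sample,
		 * then halve the running sum.
		 */
		avg_steal += deltas[i];
		avg_steal /= 2;
		printf("update %zu: delta=%llu avg_steal=%llu -> %s\n",
		       i, (unsigned long long)deltas[i],
		       (unsigned long long)avg_steal,
		       avg_steal < thresh ? "not preempted" : "preempted");
	}
	return 0;
}

With these inputs avg_steal jumps above the threshold after one large
sample and then decays back below it within a few steal-free updates,
which is the intended behavior: a vCPU that briefly saw steal time
stops being reported as preempted after roughly log2(avg_steal/thresh)
quiet samples, while a vCPU running exclusively on a physical CPU
never crosses the threshold at all.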