[PATCH OLK-5.10 0/2] KVM: arm64: pvsched bugfix and module param

Yanan Wang (1):
  KVM:arm64:Add a module param to enable/disable pv_preempted dynamically

Zhiqiang Ni (1):
  KVM: ARM: mark vCPU preempted if average guest steal time is considerable

 arch/arm64/include/asm/kvm_host.h |  6 ++++-
 arch/arm64/kvm/arm.c              | 45 ++++++++++++++++++++++++++++---
 arch/arm64/kvm/pvsched.c          |  2 ++
 arch/arm64/kvm/pvtime.c           |  8 ++++++
 include/trace/events/kvm.h        | 24 +++++++++++++++++
 5 files changed, 80 insertions(+), 5 deletions(-)

-- 
2.33.0

From: Zhiqiang Ni <nizhiqiang1@huawei.com>

virt inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/IBNSBL

--------------------------------

KVM marks a vCPU preempted whenever kvm_arch_vcpu_put() is called. This
makes the guest scheduler treat it as an unavailable idle CPU even when
the vCPU runs exclusively on its physical CPU, which causes lots of
harmful CPU migrations and hurts e.g. guest Java GC performance.

So take the average guest steal time into account when deciding whether
to mark a vCPU preempted. If it is negligible, we can safely assume the
vCPU is running exclusively on a physical CPU and skip marking it
preempted, which mitigates the issue.

Signed-off-by: Zhiqiang Ni <nizhiqiang1@huawei.com>
Signed-off-by: Dongxu Sun <sundongxu3@huawei.com>
---
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/kvm/arm.c              | 20 ++++++++++++++++++--
 arch/arm64/kvm/pvtime.c           |  8 ++++++++
 include/trace/events/kvm.h        | 24 ++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f171ab3d0d37..829e9b232ecf 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -407,6 +407,7 @@ struct kvm_vcpu_arch {
 	struct {
 		u64 last_steal;
 		gpa_t base;
+		u64 avg_steal;
 	} steal;
 
 	/* Guest PV sched state */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 240edaa9eb50..94e0ec08a757 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -94,6 +94,13 @@ static const struct kernel_param_ops force_wfi_trap_ops = {
 bool force_wfi_trap;
 module_param_cb(force_wfi_trap, &force_wfi_trap_ops, &force_wfi_trap, 0644);
 
+/*
+ * Set guest_steal_time_thresh to 0 to effectively disable this feature.
+ * Note 1024 should be a good guess as it works fine in the real workload.
+ */
+static unsigned long __read_mostly guest_steal_time_thresh = 1024;
+module_param(guest_steal_time_thresh, ulong, 0644);
+
 static int vcpu_req_reload_wfi_traps(const char *val, const struct kernel_param *kp)
 {
 	struct kvm *kvm;
@@ -600,8 +607,17 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
 	vcpu->cpu = -1;
 
-	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
-		kvm_update_pvsched_preempted(vcpu, 1);
+	if (kvm_arm_is_pvsched_enabled(&vcpu->arch)) {
+		if (vcpu->arch.steal.avg_steal < guest_steal_time_thresh) {
+			kvm_update_pvsched_preempted(vcpu, 0);
+			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
+				vcpu->arch.steal.avg_steal, guest_steal_time_thresh, 0);
+		} else {
+			kvm_update_pvsched_preempted(vcpu, 1);
+			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
+				vcpu->arch.steal.avg_steal, guest_steal_time_thresh, 1);
+		}
+	}
 
 #ifdef CONFIG_KVM_HISI_VIRT
 	kvm_hisi_dvmbm_put(vcpu);
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index 920ac43077ad..9b6448e1b300 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -19,6 +19,14 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
 	u64 steal = 0;
 	int idx;
 
+	/*
+	 * Because workloads change over time, we keep avg_steal as a floating
+	 * average which ends up weighing recent steal time more than old ones.
+	 */
+	vcpu->arch.steal.avg_steal +=
+		READ_ONCE(current->sched_info.run_delay) - vcpu->arch.steal.last_steal;
+	vcpu->arch.steal.avg_steal /= 2;
+
 	if (base == GPA_INVALID)
 		return;
 
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 167e26659a0b..458a2a554f4d 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -504,6 +504,30 @@ TRACE_EVENT(kvm_test_age_hva,
 	TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
 );
 
+TRACE_EVENT(kvm_arm_set_vcpu_preempted,
+	TP_PROTO(unsigned int vcpu_id, u64 avg_steal, unsigned long thresh,
+		unsigned int update_preempted_value),
+	TP_ARGS(vcpu_id, avg_steal, thresh, update_preempted_value),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, vcpu_id)
+		__field(unsigned long long, avg_steal)
+		__field(unsigned long, thresh)
+		__field(unsigned int, update_preempted_value)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->avg_steal = avg_steal;
+		__entry->thresh = thresh;
+		__entry->update_preempted_value = update_preempted_value;
+	),
+
+	TP_printk("vcpu:%u avg steal time:%llu thresh:%lu update_preempted_value:%u",
+		__entry->vcpu_id, __entry->avg_steal, __entry->thresh,
+		__entry->update_preempted_value)
+);
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
-- 
2.33.0
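[Editorial note, not part of the patch] For readers unfamiliar with the averaging
scheme above, here is a stand-alone user-space C sketch of the avg_steal update.
Only the averaging rule (add the new steal delta, then halve) and the default
threshold of 1024 are taken from the patch; the sample delta values and units
are made up for illustration. It shows that avg_steal is an exponential moving
average whose weights decay as 1/2, 1/4, 1/8, ..., so recent steal time dominates
the decision made in kvm_arch_vcpu_put():

/*
 * Stand-alone sketch, not kernel code. Only the rule
 * (avg += delta; avg /= 2) mirrors the patch; the delta values
 * below are invented to show how the average reacts to a burst.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t avg_steal = 0;
	uint64_t thresh = 1024;                         /* default guest_steal_time_thresh */
	uint64_t deltas[] = { 0, 0, 4096, 4096, 0, 0 }; /* steal accrued between updates */
	size_t i;

	for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		avg_steal += deltas[i];
		avg_steal /= 2;
		printf("delta=%llu avg=%llu -> mark preempted on put: %s\n",
		       (unsigned long long)deltas[i],
		       (unsigned long long)avg_steal,
		       avg_steal < thresh ? "no" : "yes");
	}
	return 0;
}

With this sample sequence, a single burst of steal time pushes the average above
the threshold immediately, while two quiet cycles afterwards are enough for it to
decay back below the threshold and stop marking the vCPU preempted.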

From: Yanan Wang <wangyanan55@huawei.com>

virt inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/IBNSBL

--------------------------------

pv_preempted does not guarantee a performance improvement in every
scenario, so add a module parameter to enable/disable pv_preempted
dynamically when it is not needed.

Signed-off-by: Yanan Wang <wangyanan55@huawei.com>
Signed-off-by: Dongxu Sun <sundongxu3@huawei.com>
---
 arch/arm64/include/asm/kvm_host.h |  5 ++++-
 arch/arm64/kvm/arm.c              | 27 ++++++++++++++++++++++++---
 arch/arm64/kvm/pvsched.c          |  2 ++
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 829e9b232ecf..b299d2d57085 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -413,6 +413,7 @@ struct kvm_vcpu_arch {
 	/* Guest PV sched state */
 	struct {
 		bool pv_unhalted;
+		bool preempted;
 		gpa_t base;
 	} pvsched;
 
@@ -646,12 +647,14 @@ long kvm_hypercall_pvsched_features(struct kvm_vcpu *vcpu);
 void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted);
 long kvm_pvsched_kick_vcpu(struct kvm_vcpu *vcpu);
 
+extern bool pv_preempted_enable;
 static inline void kvm_arm_pvsched_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
 {
 	vcpu_arch->pvsched.base = GPA_INVALID;
+	vcpu_arch->pvsched.preempted = false;
 }
 
-static inline bool kvm_arm_is_pvsched_enabled(struct kvm_vcpu_arch *vcpu_arch)
+static inline bool kvm_arm_is_pvsched_valid(struct kvm_vcpu_arch *vcpu_arch)
 {
 	return (vcpu_arch->pvsched.base != GPA_INVALID);
 }
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 94e0ec08a757..c37f6238c901 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -84,6 +84,15 @@ unsigned int twedel = 0;
 module_param(twedel, uint, S_IRUGO | S_IWUSR);
 #endif
 
+static const struct kernel_param_ops pv_preempted_enable_ops = {
+	.set = param_set_bool,
+	.get = param_get_bool,
+};
+
+bool pv_preempted_enable = true;
+MODULE_PARM_DESC(pv_preempted_enable, "bool");
+module_param_cb(pv_preempted_enable, &pv_preempted_enable_ops, &pv_preempted_enable, 0644);
+
 static int vcpu_req_reload_wfi_traps(const char *val, const struct kernel_param *kp);
 
 static const struct kernel_param_ops force_wfi_trap_ops = {
@@ -582,8 +591,20 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (vcpu_has_ptrauth(vcpu))
 		vcpu_ptrauth_disable(vcpu);
 
-	if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
-		kvm_update_pvsched_preempted(vcpu, 0);
+	/*
+	 * When pv_preempted is changed from enabled to disabled, preempted
+	 * state will not be updated in kvm_arch_vcpu_put/load. So we must
+	 * update the preempted state to 0 for every vCPU in case some vCPUs'
+	 * preempted state will always be 1.
+	 */
+	if (kvm_arm_is_pvsched_valid(&vcpu->arch)) {
+		if (pv_preempted_enable)
+			kvm_update_pvsched_preempted(vcpu, 0);
+		else {
+			if (vcpu->arch.pvsched.preempted)
+				kvm_update_pvsched_preempted(vcpu, 0);
+		}
+	}
 
 #ifdef CONFIG_KVM_HISI_VIRT
 	kvm_hisi_dvmbm_load(vcpu);
@@ -607,7 +628,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
 	vcpu->cpu = -1;
 
-	if (kvm_arm_is_pvsched_enabled(&vcpu->arch)) {
+	if (kvm_arm_is_pvsched_valid(&vcpu->arch) && pv_preempted_enable) {
 		if (vcpu->arch.steal.avg_steal < guest_steal_time_thresh) {
 			kvm_update_pvsched_preempted(vcpu, 0);
 			trace_kvm_arm_set_vcpu_preempted(vcpu->vcpu_id,
diff --git a/arch/arm64/kvm/pvsched.c b/arch/arm64/kvm/pvsched.c
index dc1768815467..9693415226d1 100644
--- a/arch/arm64/kvm/pvsched.c
+++ b/arch/arm64/kvm/pvsched.c
@@ -34,6 +34,8 @@ void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted)
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	pagefault_enable();
+
+	vcpu->arch.pvsched.preempted = !!preempted;
 }
 
 long kvm_pvsched_kick_vcpu(struct kvm_vcpu *vcpu)
-- 
2.33.0
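[Editorial note, not part of the patch] To make the comment in
kvm_arch_vcpu_load() above more concrete, here is a stand-alone C sketch of why
the load path must still clear a stale flag after pv_preempted_enable is switched
off. The struct vcpu, vcpu_put() and vcpu_load() below are simplified stand-ins,
not kernel APIs, and a single bool models the guest-visible preempted state:

/*
 * Simplified stand-in, not kernel code: once the feature is disabled,
 * vcpu_put() no longer touches the shared state, so a 1 written earlier
 * would persist forever unless vcpu_load() cleans it up.
 */
#include <stdbool.h>
#include <stdio.h>

static bool pv_preempted_enable = true;

struct vcpu {
	bool preempted;		/* models the guest-visible pvsched field */
};

static void vcpu_put(struct vcpu *v)
{
	/* Feature disabled: shared state is left untouched on put. */
	if (pv_preempted_enable)
		v->preempted = true;
}

static void vcpu_load(struct vcpu *v)
{
	/*
	 * Clear on load, and also clean up a value left over from before
	 * the feature was disabled (the patch checks pvsched.preempted).
	 */
	if (pv_preempted_enable || v->preempted)
		v->preempted = false;
}

int main(void)
{
	struct vcpu v = { .preempted = false };

	vcpu_put(&v);			/* marked preempted while enabled */
	pv_preempted_enable = false;	/* admin disables the feature */
	vcpu_load(&v);			/* without the clean-up, 1 would stick */
	printf("preempted after reload: %d\n", v.preempted);	/* prints 0 */
	return 0;
}

The pvsched.preempted bit added in pvsched.c is what lets the real code limit
this clean-up to vCPUs that actually still have a stale value published.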

FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing
list has been converted to a pull request successfully!
Pull request link:
https://gitee.com/openeuler/kernel/pulls/15260
Mailing list address:
https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/FV6...