From: Xiangyou Xie <xiexiangyou@huawei.com>
hulk inclusion
category: feature
bugzilla: 176961, https://gitee.com/openeuler/kernel/issues/I4E05T
-------------------------------------------------
In a guest, poll for a while before entering real idle. If TIF_NEED_RESCHED is set for the current task during the polling window, break out of the polling loop immediately.
The polling window, poll_threshold_ns, can be adjusted via sysctl so that polling does not pin host CPU usage at 100%; tune the value to the workload's requirements.
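Since the entry is added to kern_table, the knob shows up as /proc/sys/kernel/halt_poll_threshold and takes a value in nanoseconds. As a minimal illustration (the helper program and the 200000 value are examples, not part of the patch), a 200us poll window could be configured like this:

#include <stdio.h>

/* Illustrative only: set a 200us (200000 ns) pre-idle poll window. */
int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/halt_poll_threshold", "w");

	if (!f) {
		perror("halt_poll_threshold");
		return 1;
	}
	fprintf(f, "%lu\n", 200000UL);	/* nanoseconds */
	return fclose(f) ? 1 : 0;
}

Writing 0 (the default, since poll_threshold_ns is zero-initialized) disables the pre-idle polling entirely.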
This optimization requires the architecture to support _TIF_POLLING_NRFLAG, which avoids the overhead of the ttwu (try_to_wake_up()) IPI. Wakeup response delay is reduced from 4us to 1us.
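The IPI avoidance works along the lines of the scheduler's existing remote-wakeup path: once the idle task advertises _TIF_POLLING_NRFLAG, a waker only has to set TIF_NEED_RESCHED and the poll loop notices it. A simplified sketch of the waker side, modeled on the scheduler's use of set_nr_if_polling() in the ttwu queueing path (illustration only, not part of this patch):

static void wake_idle_cpu_sketch(int cpu)
{
	struct task_struct *idle = cpu_rq(cpu)->idle;

	/*
	 * set_nr_if_polling() sets TIF_NEED_RESCHED only if the idle
	 * task is polling on it; in that case the poll loop breaks
	 * out on its own and no reschedule IPI is needed.
	 */
	if (set_nr_if_polling(idle))
		trace_sched_wake_idle_without_ipi(cpu);
	else
		smp_send_reschedule(cpu);
}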
Signed-off-by: Xiangyou Xie <xiexiangyou@huawei.com>
Signed-off-by: Li Hua <hucool.lihua@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 include/linux/kernel.h |  2 ++
 kernel/sched/idle.c    | 30 +++++++++++++++++++++++++++++-
 kernel/sysctl.c        |  7 +++++++
 3 files changed, 38 insertions(+), 1 deletion(-)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 0b809c92d817..63555b947d96 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -555,6 +555,8 @@ extern int sysctl_panic_on_rcu_stall;
 extern int sysctl_panic_on_stackoverflow;
 
 extern bool crash_kexec_post_notifiers;
 
+extern unsigned long poll_threshold_ns;
+
 /*
  * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 36b545f17206..0bd53051ac71 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -13,6 +13,12 @@
 /* Linker adds these: start and end of __cpuidle functions */
 extern char __cpuidle_text_start[], __cpuidle_text_end[];
 
+/*
+ * Poll_threshold_ns indicates the maximum polling time before
+ * entering real idle.
+ */
+unsigned long poll_threshold_ns;
+
 /**
  * sched_idle_set_state - Record idle state for the current CPU.
  * @idle_state: State to record.
@@ -52,12 +58,31 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
 __setup("hlt", cpu_idle_nopoll_setup);
 #endif
 
+static void smart_idle_poll(void)
+{
+	unsigned long poll_duration = poll_threshold_ns;
+	ktime_t cur, stop;
+
+	if (!poll_duration)
+		return;
+
+	stop = ktime_add_ns(ktime_get(), poll_duration);
+
+	do {
+		cpu_relax();
+		if (tif_need_resched())
+			break;
+		cur = ktime_get();
+	} while (ktime_before(cur, stop));
+}
+
 static noinline int __cpuidle cpu_idle_poll(void)
 {
 	trace_cpu_idle(0, smp_processor_id());
 	stop_critical_timings();
 	rcu_idle_enter();
 	local_irq_enable();
+	smart_idle_poll();
 
 	while (!tif_need_resched() &&
 	       (cpu_idle_force_poll || tick_check_broadcast_expired()))
@@ -261,6 +286,7 @@ static void cpuidle_idle_call(void)
 static void do_idle(void)
 {
 	int cpu = smp_processor_id();
+	unsigned long idle_poll_flag = poll_threshold_ns;
 	/*
 	 * If the arch has a polling bit, we maintain an invariant:
 	 *
@@ -293,9 +319,11 @@ static void do_idle(void)
 		 * broadcast device expired for us, we don't want to go deep
 		 * idle as we know that the IPI is going to arrive right away.
 		 */
-		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+		if (cpu_idle_force_poll || tick_check_broadcast_expired() ||
+		    idle_poll_flag) {
 			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
+			idle_poll_flag = 0;
 		} else {
 			cpuidle_idle_call();
 		}

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 513cb217232a..261388afe482 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1849,6 +1849,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= sysctl_sched_uclamp_handler,
 	},
 #endif
+	{
+		.procname	= "halt_poll_threshold",
+		.data		= &poll_threshold_ns,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",
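A hedged way to sanity-check the 4us -> 1us wakeup figure from inside the guest is a thread ping-pong over eventfds; the program below is illustrative only (names, iteration count, and methodology are assumptions, not part of the patch). Each round trip contains two wakeups, so per-wakeup latency is roughly half the printed average; pin the two threads to different vCPUs (e.g. with taskset) so the sleeping side's CPU actually enters idle.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/eventfd.h>
#include <time.h>

#define ITERS 100000

static int efd_ping, efd_pong;

static void *echo_thread(void *arg)
{
	uint64_t v;

	for (int i = 0; i < ITERS; i++) {
		if (read(efd_ping, &v, sizeof(v)) != (ssize_t)sizeof(v))
			break;	/* sleep until pinged */
		if (write(efd_pong, &v, sizeof(v)) != (ssize_t)sizeof(v))
			break;	/* wake the measuring thread */
	}
	return NULL;
}

int main(void)
{
	struct timespec t0, t1;
	uint64_t v = 1;
	pthread_t tid;

	efd_ping = eventfd(0, 0);
	efd_pong = eventfd(0, 0);
	pthread_create(&tid, NULL, echo_thread, NULL);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < ITERS; i++) {
		if (write(efd_ping, &v, sizeof(v)) != (ssize_t)sizeof(v) ||
		    read(efd_pong, &v, sizeof(v)) != (ssize_t)sizeof(v))
			break;
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("avg round trip: %ld ns\n",
	       ((t1.tv_sec - t0.tv_sec) * 1000000000L +
		(t1.tv_nsec - t0.tv_nsec)) / ITERS);

	pthread_join(tid, NULL);
	return 0;
}

With poll_threshold_ns set, the sleeper's CPU busy-waits in cpu_idle_poll() for up to the configured window instead of entering a deeper idle state, which is where the latency reduction comes from.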