From: Xiangyou Xie xiexiangyou@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I912VN
-------------------------------------------------
In the guest, before entering real idle, poll for a while. If the current task is marked with TIF_NEED_RESCHED during the polling process, it breaks out of the polling loop immediately.
The polling time poll_threshold_ns can be adjusted via sysctl to avoid 100% CPU usage on the host. This value can be tuned according to the workload's requirements.
This optimization requires _TIF_POLLING_NRFLAG support, which avoids the overhead of the ttwu IPI. Wakeup response delay is reduced from 4us to 1us.
Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Signed-off-by: liangtian liangtian13@huawei.com --- include/linux/kernel.h | 1 + kernel/sched/idle.c | 30 +++++++++++++++++++++++++++++- kernel/sysctl.c | 7 +++++++ 3 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index cee8fe87e9f4..d0ac98b7d656 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -220,6 +220,7 @@ extern void bust_spinlocks(int yes); extern int root_mountflags;
extern bool early_boot_irqs_disabled; +extern unsigned long poll_threshold_ns;
/* * Values used for system_state. Ordering of the states must not be changed diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 5007b25c5bc6..d09a1ff1c87a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -10,6 +10,12 @@ /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[];
+/* + * Poll_threshold_ns indicates the maximum polling time before + * entering real idle. + */ +unsigned long poll_threshold_ns; + /** * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. @@ -49,6 +55,24 @@ static int __init cpu_idle_nopoll_setup(char *__unused) __setup("hlt", cpu_idle_nopoll_setup); #endif
+static void smart_idle_poll(void) +{ + unsigned long poll_duration = poll_threshold_ns; + ktime_t cur, stop; + + if (!poll_duration) + return; + + stop = ktime_add_ns(ktime_get(), poll_duration); + + do { + cpu_relax(); + if (tif_need_resched()) + break; + cur = ktime_get(); + } while (ktime_before(cur, stop)); +} + static noinline int __cpuidle cpu_idle_poll(void) { instrumentation_begin(); @@ -56,6 +80,7 @@ static noinline int __cpuidle cpu_idle_poll(void) stop_critical_timings(); ct_cpuidle_enter();
+ smart_idle_poll(); raw_local_irq_enable(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) @@ -237,6 +262,7 @@ static void cpuidle_idle_call(void) static void do_idle(void) { int cpu = smp_processor_id(); + unsigned long idle_poll_flag = poll_threshold_ns;
/* * Check if we need to update blocked load @@ -275,9 +301,11 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + if (cpu_idle_force_poll || tick_check_broadcast_expired() || + idle_poll_flag) { tick_nohz_idle_restart_tick(); cpu_idle_poll(); + idle_poll_flag = 0; } else { cpuidle_idle_call(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e84df0818517..cdc117677647 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2025,6 +2025,13 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "halt_poll_threshold", + .data = &poll_threshold_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, #ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall",