From: "Peter Zijlstra (Intel)" <peterz@infradead.org> stable inclusion from stable-v6.6.120 commit 51445190c10a36d292e70db085d0fb6cc3bec94f category: perf bugzilla: https://atomgit.com/openeuler/kernel/issues/8555 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=... -------------------------------- commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream. Add a randomized algorithm that runs newidle balancing proportional to its success rate. This improves schbench significantly: 6.18-rc4: 2.22 Mrps/s 6.18-rc4+revert: 2.04 Mrps/s 6.18-rc4+revert+random: 2.18 Mrps/s Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%: 6.17: -6% 6.17+revert: 0% 6.17+revert+random: -1% Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com> Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com> Tested-by: Chris Mason <clm@meta.com> Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com Link: https://patch.msgid.link/20251107161739.770122091@infradead.org [ Ajay: Modified to apply on v6.6 ] Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Conflicts: kernel/sched/core.c kernel/sched/fair.c [context conflicts] Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com> --- include/linux/sched/topology.h | 3 +++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++---- kernel/sched/features.h | 5 ++++ kernel/sched/sched.h | 7 ++++++ kernel/sched/topology.c | 6 +++++ 6 files changed, 64 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7eee852aa384..95e5d7772800 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -130,6 +130,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ /* idle_balance() stats */ + unsigned int newidle_call; + 
unsigned int newidle_success; + unsigned int newidle_ratio; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 69d10fdb84d8..cf45ea94bb05 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,6 +119,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); #ifdef CONFIG_QOS_SCHED static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg); @@ -9976,6 +9977,8 @@ void __init sched_init_smp(void) sched_init_numa(NUMA_NO_NODE); set_sched_cluster(); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a4592d35da8b..d16c65d5fa34 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13651,8 +13651,24 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) { + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) +{ + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the @@ -13700,7 +13716,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. 
*/ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -14336,6 +14352,22 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -14343,10 +14375,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 52ea0097c513..6cd6b5307430 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -106,6 +106,11 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) +/* + * Do newidle balancing proportional to its success rate using randomization. 
+ */ +SCHED_FEAT(NI_RANDOM, true) + SCHED_FEAT(HZ_BW, true) SCHED_FEAT(IRQ_AVG, false) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bb581cbdae8d..0c3abb44f365 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include <linux/prandom.h> #include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> #include <linux/sched/cpufreq.h> @@ -1377,6 +1378,12 @@ static inline bool is_migration_disabled(struct task_struct *p) } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3023e67da0fd..cf847fdf1063 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1644,6 +1644,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -- 2.34.1