[PATCH OLK-6.6 0/2] optimize lightweight load balancing
*** BLURB HERE ***

Chen Jinghuang (1):
  sched: Fix kabi breakage of struct sched_domain for newidle_stats

Peter Zijlstra (Intel) (1):
  sched/fair: Proportional newidle balance

 include/linux/sched/topology.h |  8 +++++-
 kernel/sched/core.c            |  3 +++
 kernel/sched/fair.c            | 48 +++++++++++++++++++++++++++++++---
 kernel/sched/features.h        |  5 ++++
 kernel/sched/sched.h           |  7 +++++
 kernel/sched/topology.c        |  6 +++++
 6 files changed, 72 insertions(+), 5 deletions(-)

-- 
2.34.1
From: "Peter Zijlstra (Intel)" <peterz@infradead.org> stable inclusion from stable-v6.6.120 commit 51445190c10a36d292e70db085d0fb6cc3bec94f category: perf bugzilla: https://atomgit.com/openeuler/kernel/issues/8555 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=... -------------------------------- commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream. Add a randomized algorithm that runs newidle balancing proportional to its success rate. This improves schbench significantly: 6.18-rc4: 2.22 Mrps/s 6.18-rc4+revert: 2.04 Mrps/s 6.18-rc4+revert+random: 2.18 Mrps/S Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%: 6.17: -6% 6.17+revert: 0% 6.17+revert+random: -1% Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com> Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com> Tested-by: Chris Mason <clm@meta.com> Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com Link: https://patch.msgid.link/20251107161739.770122091@infradead.org [ Ajay: Modified to apply on v6.6 ] Signed-off-by: Ajay Kaher <ajay.kaher@broadcom.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Conflicts: kernel/sched/core.c kernel/sched/fair.c [context conflicts] Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com> --- include/linux/sched/topology.h | 3 +++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++---- kernel/sched/features.h | 5 ++++ kernel/sched/sched.h | 7 ++++++ kernel/sched/topology.c | 6 +++++ 6 files changed, 64 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7eee852aa384..95e5d7772800 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -130,6 +130,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ /* idle_balance() stats */ + unsigned int newidle_call; + unsigned int newidle_success; + unsigned int newidle_ratio; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 69d10fdb84d8..cf45ea94bb05 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,6 +119,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); #ifdef CONFIG_QOS_SCHED static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg); @@ -9976,6 +9977,8 @@ void __init sched_init_smp(void) sched_init_numa(NUMA_NO_NODE); set_sched_cluster(); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a4592d35da8b..d16c65d5fa34 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13651,8 +13651,24 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) { + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool 
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) +{ + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the @@ -13700,7 +13716,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. */ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -14336,6 +14352,22 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -14343,10 +14375,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 52ea0097c513..24a0c853a8a0 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -106,6 +106,11 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) +/* + * Do newidle balancing proportional to its success rate using randomization. + */ +SCHED_FEAT(NI_RANDOM, false) + SCHED_FEAT(HZ_BW, true) SCHED_FEAT(IRQ_AVG, false) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bb581cbdae8d..0c3abb44f365 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include <linux/prandom.h> #include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> #include <linux/sched/cpufreq.h> @@ -1377,6 +1378,12 @@ static inline bool is_migration_disabled(struct task_struct *p) } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3023e67da0fd..cf847fdf1063 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1644,6 +1644,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -- 2.34.1
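For readers who want to see the NI_RANDOM arithmetic in isolation, here is
a minimal userspace sketch of the gating logic the newidle_balance() hunk
above adds. It is illustrative only: struct sd_stats, the 80% pull
probability, the iteration count, and rand() standing in for sched_rng()
are all assumptions of the sketch, not kernel code.

	/* sketch: proportional-random gating of newidle balancing */
	#include <stdio.h>
	#include <stdlib.h>

	struct sd_stats {
		unsigned int newidle_call;
		unsigned int newidle_success;
		unsigned int newidle_ratio;
	};

	static void update_newidle_stats(struct sd_stats *sd, unsigned int success)
	{
		sd->newidle_call++;
		sd->newidle_success += success;

		if (sd->newidle_call >= 1024) {
			/* snapshot the success rate, then age both counters */
			sd->newidle_ratio = sd->newidle_success;
			sd->newidle_call /= 2;
			sd->newidle_success /= 2;
		}
	}

	int main(void)
	{
		/* sd_init() defaults: assume a 50% success rate to start */
		struct sd_stats sd = {
			.newidle_call = 512, .newidle_success = 256, .newidle_ratio = 512,
		};
		unsigned int ran = 0, skipped = 0;

		for (int i = 0; i < 100000; i++) {
			unsigned int weight = 1 + sd.newidle_ratio;
			unsigned int d1k = rand() % 1024;	/* the 1k sided dice */

			if (d1k > weight) {
				/* dice says skip; the skipped call counts as a failure */
				update_newidle_stats(&sd, 0);
				skipped++;
				continue;
			}

			/* "run" the balance; pretend it pulls a task 80% of the time */
			int pulled = (rand() % 100) < 80;

			/*
			 * Only ~weight/1024 of the calls actually ran, so a success
			 * is credited with ~1024/weight (rounded) to keep the ratio
			 * on the same 0..1024 scale.
			 */
			weight = (1024 + weight / 2) / weight;
			update_newidle_stats(&sd, weight * !!pulled);
			ran++;
		}

		printf("ran=%u skipped=%u newidle_ratio=%u/1024\n",
		       ran, skipped, sd.newidle_ratio);
		return 0;
	}

Starting from the sd_init() defaults the loop runs about half the time, and
newidle_ratio then drifts toward roughly 1024 times the observed success
probability: skipped calls are booked as failures, while successful runs
are credited with the compensating weight.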
hulk inclusion
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8555

--------------------------------

Fix kabi breakage of struct sched_domain for newidle_stats.

Fixes: 9b20f39a39e9 ("sched/fair: Proportional newidle balance")
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
---
 include/linux/sched/topology.h | 11 +++++++----
 kernel/sched/fair.c            | 10 +++++++---
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 95e5d7772800..e48e1347ed86 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -129,10 +129,7 @@ struct sched_domain {
 	unsigned int balance_interval;	/* initialise to 1. units in ms. */
 	unsigned int nr_balance_failed; /* initialise to 0 */
 
-	/* idle_balance() stats */
-	unsigned int newidle_call;
-	unsigned int newidle_success;
-	unsigned int newidle_ratio;
+
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
@@ -179,6 +176,12 @@ struct sched_domain {
 	struct sched_domain_shared *shared;
 
 	unsigned int span_weight;
+
+	/* idle_balance() stats */
+	KABI_FILL_HOLE(unsigned int newidle_call : 10)
+	KABI_FILL_HOLE(unsigned int newidle_success : 11)
+	KABI_FILL_HOLE(unsigned int newidle_ratio : 11)
+
 	/*
 	 * Span of all CPUs in this domain.
 	 *
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d16c65d5fa34..3a14d9cdd894 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13653,14 +13653,18 @@ void update_max_interval(void)
 
 static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
 {
-	sd->newidle_call++;
+	unsigned int newidle_call = sd->newidle_call;
+
+	newidle_call++;
 	sd->newidle_success += success;
 
-	if (sd->newidle_call >= 1024) {
+	if (newidle_call >= 1024) {
 		sd->newidle_ratio = sd->newidle_success;
-		sd->newidle_call /= 2;
+		newidle_call /= 2;
 		sd->newidle_success /= 2;
 	}
+
+	sd->newidle_call = newidle_call;
 }
 
 static inline bool
-- 
2.34.1
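Context for the fix above: struct sched_domain falls under the openEuler
kABI guarantee, so the three full-width counters added by the first patch
would change the checked structure layout. Re-declaring them as
10+11+11 = 32 bits of KABI_FILL_HOLE bitfields keeps them within existing
padding instead. The side effect is that a 10-bit newidle_call can only
represent 0..1023: incrementing the field directly wraps 1023 back to 0,
so the ">= 1024" threshold in update_newidle_stats() would never fire.
That is why the increment and compare now go through a full-width local.
A standalone sketch of the hazard (struct s and the values are illustrative
stand-ins, not kernel code):

	#include <stdio.h>

	struct s {
		unsigned int newidle_call : 10;	/* can only hold 0..1023 */
	};

	int main(void)
	{
		struct s sd = { .newidle_call = 1023 };

		/* buggy: the 10-bit field wraps, so ">= 1024" can never be true */
		sd.newidle_call++;
		printf("after wrap: %u\n", (unsigned int)sd.newidle_call);	/* 0 */

		/* fixed: count in a full-width local, store back after halving */
		sd.newidle_call = 1023;
		unsigned int call = sd.newidle_call;
		call++;			/* 1024 is representable here */
		if (call >= 1024)
			call /= 2;	/* 512 fits back into the 10-bit field */
		sd.newidle_call = call;
		printf("after fix: %u\n", (unsigned int)sd.newidle_call);	/* 512 */

		return 0;
	}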
Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list
have been converted to a pull request successfully!
Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/21181
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/IFU...