Song Zhang (1): sched: Introduce priority load balance for qos scheduler
arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + init/Kconfig | 9 +++ kernel/sched/core.c | 3 + kernel/sched/fair.c | 83 +++++++++++++++++++++++++- kernel/sched/sched.h | 3 + 6 files changed, 99 insertions(+), 1 deletion(-)
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8MWDD CVE: NA
-------------------------------------------------
Add a new sysctl interface: /proc/sys/kernel/sched_prio_load_balance_enabled
0: default behavior 1: enable priority load balance for qos scheduler
For task co-location with the qos scheduler, it is reasonable for CFS to prefer migrating online (Latency Sensitive) tasks when it does load balancing. So the CFS load balance can be changed as follows:
The cfs_tasks list is owned by online tasks. Add a new cfs_offline_tasks list, which is owned by offline tasks. Prefer to migrate the online tasks on the cfs_tasks list to the dst rq. Consider the hyperthread interference scenario with the smt expeller feature enabled: CPU A and CPU B are two hyperthreads on one physical core, and CPU A runs online tasks while CPU B has only offline tasks. The offline tasks on CPU B are expelled by the online tasks on CPU A and cannot be scheduled. However, when load balance is triggered, the load on the two CPUs is already balanced before CPU B can migrate any online tasks from CPU A. As a result, CPU B cannot run online tasks, and online tasks cannot be evenly distributed among different CPUs.
Signed-off-by: Song Zhang zhangsong34@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + init/Kconfig | 9 +++ kernel/sched/core.c | 3 + kernel/sched/fair.c | 83 +++++++++++++++++++++++++- kernel/sched/sched.h | 3 + 6 files changed, 99 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index c5bb420feb..8bc17529ea 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -166,6 +166,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_V1_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_PRIO_LB=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 15479398d9..dee5e07526 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -188,6 +188,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_V1_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_PRIO_LB=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y diff --git a/init/Kconfig b/init/Kconfig index c8909ca8bb..9a4607729a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1031,6 +1031,15 @@ config QOS_SCHED
If in doubt, say N.
+config QOS_SCHED_PRIO_LB + bool "Priority load balance for Qos scheduler" + depends on QOS_SCHED + default n + help + This feature enable priority load balance + for Qos scheduler, which prefer migrating online tasks + and migrating offline tasks secondly between CPUs. + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58c274b655..f40ca56bec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10045,6 +10045,9 @@ void __init sched_init(void) rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks); +#ifdef CONFIG_QOS_SCHED_PRIO_LB + INIT_LIST_HEAD(&rq->cfs_offline_tasks); +#endif
rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 640c0a73e7..6a28cd45de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -147,6 +147,10 @@ static int hundred_thousand = 100000; static int unthrottle_qos_cfs_rqs(int cpu); #endif
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +unsigned int sysctl_sched_prio_load_balance_enabled; +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -234,6 +238,17 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, +#endif +#ifdef CONFIG_QOS_SCHED_PRIO_LB + { + .procname = "sched_prio_load_balance_enabled", + .data = &sysctl_sched_prio_load_balance_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #endif {} }; @@ -3585,6 +3600,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +static void +adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *), + struct rq *rq, + struct sched_entity *se) +{ + struct task_group *tg = task_group(task_of(se)); + + if (sysctl_sched_prio_load_balance_enabled && tg->qos_level == -1) + (*list_op)(&se->group_node, &rq->cfs_offline_tasks); + else + (*list_op)(&se->group_node, &rq->cfs_tasks); +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3594,7 +3624,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se)); +#ifdef CONFIG_QOS_SCHED_PRIO_LB + adjust_rq_cfs_tasks(list_add, rq, se); +#else list_add(&se->group_node, &rq->cfs_tasks); +#endif } #endif cfs_rq->nr_running++; @@ -8885,7 +8919,11 @@ done: __maybe_unused; * the list, so our cfs_tasks list becomes MRU * one. */ +#ifdef CONFIG_QOS_SCHED_PRIO_LB + adjust_rq_cfs_tasks(list_move, rq, &p->se); +#else list_move(&p->se.group_node, &rq->cfs_tasks); +#endif #endif
if (hrtick_enabled_fair(rq)) @@ -9233,6 +9271,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env) (&p->se == cfs_rq_of(&p->se)->next)) return 1;
+#ifdef CONFIG_QOS_SCHED_PRIO_LB + /* Preempt sched idle cpu do not consider migration cost */ + if (sysctl_sched_prio_load_balance_enabled && + cpus_share_cache(env->src_cpu, env->dst_cpu) && + sched_idle_cpu(env->dst_cpu)) + return 0; +#endif + if (sysctl_sched_migration_cost == -1) return 1;
@@ -9432,11 +9478,18 @@ static void detach_task(struct task_struct *p, struct lb_env *env) static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p; + struct list_head *tasks = &env->src_rq->cfs_tasks; +#ifdef CONFIG_QOS_SCHED_PRIO_LB + int loop = 0; +#endif
lockdep_assert_rq_held(env->src_rq);
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +again: +#endif list_for_each_entry_reverse(p, - &env->src_rq->cfs_tasks, se.group_node) { + tasks, se.group_node) { if (!can_migrate_task(p, env)) continue;
@@ -9451,6 +9504,15 @@ static struct task_struct *detach_one_task(struct lb_env *env) schedstat_inc(env->sd->lb_gained[env->idle]); return p; } +#ifdef CONFIG_QOS_SCHED_PRIO_LB + if (sysctl_sched_prio_load_balance_enabled) { + loop++; + if (loop == 1) { + tasks = &env->src_rq->cfs_offline_tasks; + goto again; + } + } +#endif return NULL; }
@@ -9466,6 +9528,9 @@ static int detach_tasks(struct lb_env *env) unsigned long util, load; struct task_struct *p; int detached = 0; +#ifdef CONFIG_QOS_SCHED_PRIO_LB + int loop = 0; +#endif
lockdep_assert_rq_held(env->src_rq);
@@ -9481,6 +9546,9 @@ static int detach_tasks(struct lb_env *env) if (env->imbalance <= 0) return 0;
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +again: +#endif while (!list_empty(tasks)) { /* * We don't want to steal all, otherwise we may be treated likewise, @@ -9586,6 +9654,15 @@ static int detach_tasks(struct lb_env *env) list_move(&p->se.group_node, tasks); }
+#ifdef CONFIG_QOS_SCHED_PRIO_LB + if (sysctl_sched_prio_load_balance_enabled && env->imbalance > 0) { + loop++; + if (loop == 1) { + tasks = &env->src_rq->cfs_offline_tasks; + goto again; + } + } +#endif /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather @@ -13212,7 +13289,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) * Move the next running task to the front of the list, so our * cfs_tasks list becomes MRU one. */ +#ifdef CONFIG_QOS_SCHED_PRIO_LB + adjust_rq_cfs_tasks(list_move, rq, se); +#else list_move(&se->group_node, &rq->cfs_tasks); +#endif } #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3de84e95ba..76951226b6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1075,6 +1075,9 @@ struct rq { int online;
struct list_head cfs_tasks; +#ifdef CONFIG_QOS_SCHED_PRIO_LB + struct list_head cfs_offline_tasks; +#endif
struct sched_avg avg_rt; struct sched_avg avg_dl;
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/3688 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/D...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3688 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/D...