euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8MWDD CVE: NA
-------------------------------------------------
Add new sysctl interface: /proc/sys/kernel/sched_prio_load_balance_enabled
0: default behavior 1: enable priority load balance for qos scheduler
For tasks co-located with the qos scheduler, when CFS does load balancing, it is reasonable to prefer migrating online (latency-sensitive) tasks. So the CFS load balance can be changed as below:
The cfs_tasks list holds online tasks. Add a new cfs_offline_tasks list which holds offline tasks. Prefer to migrate the online tasks of the cfs_tasks list to the dst rq. In the scenario of hyperthread interference, if the smt expeller feature is enabled, consider CPU A and CPU B, two hyperthreads on a physical core, where CPU A runs online tasks while CPU B only has offline tasks. The offline tasks on CPU B are expelled by the online tasks on CPU A and cannot be scheduled. However, when load balance is triggered, before CPU B can migrate some online tasks from CPU A, the load on the two cpus is already balanced. As a result, CPU B cannot run online tasks, and online tasks cannot be evenly distributed among different cpus.
Signed-off-by: Song Zhang zhangsong34@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + include/linux/sched/sysctl.h | 4 ++ init/Kconfig | 9 +++ kernel/sched/core.c | 3 + kernel/sched/fair.c | 90 +++++++++++++++++++++++++- kernel/sched/sched.h | 3 + 7 files changed, 110 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 039f3496af..296421c0d5 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -162,6 +162,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_PRIO_LB=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index f481c1c8b4..efb9a1efa7 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -184,6 +184,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_PRIO_LB=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 5a64582b08..efee7213a0 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -29,4 +29,8 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +extern unsigned int sysctl_sched_prio_load_balance_enabled; +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/init/Kconfig b/init/Kconfig index b6952df34e..c177b7591a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -997,6 +997,15 @@ config QOS_SCHED
If in doubt, say N.
+config QOS_SCHED_PRIO_LB + bool "Priority load balance for Qos scheduler" + depends on QOS_SCHED + default n + help + This feature enable priority load balance + for Qos scheduler, which prefer migrating online tasks + and migrating offline tasks secondly between CPUs. + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1c73dea1f..d68ddb4d28 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10045,6 +10045,9 @@ void __init sched_init(void) rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks); +#ifdef CONFIG_QOS_SCHED_PRIO_LB + INIT_LIST_HEAD(&rq->cfs_offline_tasks); +#endif
rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ae0d65713..3b35448448 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -147,6 +147,10 @@ static int hundred_thousand = 100000; static int unthrottle_qos_cfs_rqs(int cpu); #endif
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +unsigned int sysctl_sched_prio_load_balance_enabled; +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -214,6 +218,17 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ONE_HUNDRED, .extra2 = &one_thousand, }, +#endif +#ifdef CONFIG_QOS_SCHED_PRIO_LB + { + .procname = "sched_prio_load_balance_enabled", + .data = &sysctl_sched_prio_load_balance_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #endif {} }; @@ -3565,6 +3580,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +static void +adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *), + struct rq *rq, + struct sched_entity *se) +{ + struct task_group *tg = task_group(task_of(se)); + + if (sysctl_sched_prio_load_balance_enabled && tg->qos_level == -1) + (*list_op)(&se->group_node, &rq->cfs_offline_tasks); + else + (*list_op)(&se->group_node, &rq->cfs_tasks); +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3574,7 +3604,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se)); +#ifdef CONFIG_QOS_SCHED_PRIO_LB + adjust_rq_cfs_tasks(list_add, rq, se); +#else list_add(&se->group_node, &rq->cfs_tasks); +#endif } #endif cfs_rq->nr_running++; @@ -8611,7 +8645,11 @@ done: __maybe_unused; * the list, so our cfs_tasks list becomes MRU * one. */ +#ifdef CONFIG_QOS_SCHED_PRIO_LB + adjust_rq_cfs_tasks(list_move, rq, &p->se); +#else list_move(&p->se.group_node, &rq->cfs_tasks); +#endif #endif
if (hrtick_enabled_fair(rq)) @@ -8959,6 +8997,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env) (&p->se == cfs_rq_of(&p->se)->next)) return 1;
+#ifdef CONFIG_QOS_SCHED_PRIO_LB + /* Preempt sched idle cpu do not consider migration cost */ + if (sysctl_sched_prio_load_balance_enabled && + cpus_share_cache(env->src_cpu, env->dst_cpu) && + sched_idle_cpu(env->dst_cpu)) + return 0; +#endif + if (sysctl_sched_migration_cost == -1) return 1;
@@ -9149,11 +9195,18 @@ static void detach_task(struct task_struct *p, struct lb_env *env) static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p; + struct list_head *tasks = &env->src_rq->cfs_tasks; +#ifdef CONFIG_QOS_SCHED_PRIO_LB + int loop = 0; +#endif
lockdep_assert_rq_held(env->src_rq);
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +again: +#endif list_for_each_entry_reverse(p, - &env->src_rq->cfs_tasks, se.group_node) { + tasks, se.group_node) { if (!can_migrate_task(p, env)) continue;
@@ -9168,6 +9221,15 @@ static struct task_struct *detach_one_task(struct lb_env *env) schedstat_inc(env->sd->lb_gained[env->idle]); return p; } +#ifdef CONFIG_QOS_SCHED_PRIO_LB + if (sysctl_sched_prio_load_balance_enabled) { + loop++; + if (loop == 1) { + tasks = &env->src_rq->cfs_offline_tasks; + goto again; + } + } +#endif return NULL; }
@@ -9183,6 +9245,9 @@ static int detach_tasks(struct lb_env *env) unsigned long util, load; struct task_struct *p; int detached = 0; +#ifdef CONFIG_QOS_SCHED_PRIO_LB + int loop = 0; +#endif
lockdep_assert_rq_held(env->src_rq);
@@ -9198,6 +9263,9 @@ static int detach_tasks(struct lb_env *env) if (env->imbalance <= 0) return 0;
+#ifdef CONFIG_QOS_SCHED_PRIO_LB +again: +#endif while (!list_empty(tasks)) { /* * We don't want to steal all, otherwise we may be treated likewise, @@ -9303,6 +9371,22 @@ static int detach_tasks(struct lb_env *env) list_move(&p->se.group_node, tasks); }
+#ifdef CONFIG_QOS_SCHED_PRIO_LB + if (sysctl_sched_prio_load_balance_enabled && env->imbalance > 0) { + /* + * Avoid offline tasks starve to death if env->loop exceed + * env->loop_max, so we should set env->loop to 0 and detach + * offline tasks again. + */ + if (env->loop > env->loop_max) + env->loop = 0; + loop++; + if (loop == 1) { + tasks = &env->src_rq->cfs_offline_tasks; + goto again; + } + } +#endif /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather @@ -12920,7 +13004,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) * Move the next running task to the front of the list, so our * cfs_tasks list becomes MRU one. */ +#ifdef CONFIG_QOS_SCHED_PRIO_LB + adjust_rq_cfs_tasks(list_move, rq, se); +#else list_move(&se->group_node, &rq->cfs_tasks); +#endif } #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3de84e95ba..76951226b6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1075,6 +1075,9 @@ struct rq { int online;
struct list_head cfs_tasks; +#ifdef CONFIG_QOS_SCHED_PRIO_LB + struct list_head cfs_offline_tasks; +#endif
struct sched_avg avg_rt; struct sched_avg avg_dl;