hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/8929

----------------------------------------

Reuse QOS_LABEL to distinguish between online and offline tasks, and
reuse QOS_SCHED_DYNAMIC_AFFINITY to select the master SMT CPU for
online tasks.

When collecting load-balancing statistics, sample the CPU utilization
of all slave SMT CPUs within a NUMA node. Offline tasks are then
placed, and spread across the SMT sibling CPUs, according to the
target SMT sibling CPU utilization watermark:

+--------+                +--------+
|        | online/offline |        |
|  CPU0  |<-------------->|  CPU2  |
|        |       |        |        |
+--------+       |        +--------+
    |            |            |
    | offline    | offline    | offline
    \/           |            \/
+--------+       |        +---------+
|        |       \/       |         |
|  CPU1  |<-------------->|  CPU3   |
|        |    offline     |         |
+--------+                +---------+

Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
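
Notes:

The new scheduling behaviour is gated behind the SMT_TAG_PULL scheduler
feature, which defaults to off. Assuming the kernel's usual
sched-features debugfs interface, it can be exercised with:

  echo SMT_TAG_PULL > /sys/kernel/debug/sched/features
  echo 60 > /proc/sys/kernel/sched_smt_offline_util_pct

The second knob is the watermark added by this patch: the utilization
of the slave SMT CPUs of a NUMA node, as a percentage of their
capacity, below which offline tasks may be placed on (or migrated to)
the slave siblings. The default is 50.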
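
The placement and migration rules condense to the following standalone
sketch (userspace C, not kernel code). The QOS_LEVEL_* constants below
only assume that offline levels sort below QOS_LEVEL_ONLINE, as in the
tree's qos_level scale, and all sample numbers are made up:

  /* smt_qos_sketch.c: mirror of the wakeup/load-balance decisions */
  #include <stdbool.h>
  #include <stdio.h>

  enum { QOS_LEVEL_OFFLINE = -1, QOS_LEVEL_ONLINE = 0 };

  static unsigned int watermark = 50; /* sched_smt_offline_util_pct */

  /* Per-node sample: slave-SMT utilization as a percentage of capacity. */
  static unsigned long slave_util_pct(unsigned long util, unsigned long cap)
  {
          return cap ? (util * 100) / cap : 0;
  }

  /* Wakeup: which half of the cores does the task prefer? */
  static const char *preferred_mask(int qos_level, unsigned long util_pct)
  {
          if (qos_level < QOS_LEVEL_ONLINE && util_pct < watermark)
                  return "slave_smt_cpumask";
          return "master_smt_cpumask";
  }

  /* Load balance: may a task move between the two halves? */
  static bool can_migrate(int qos_level, unsigned long util_pct,
                          bool src_master, bool dst_master)
  {
          /* master -> slave: offline tasks only, below the watermark */
          if (src_master && !dst_master)
                  return qos_level < QOS_LEVEL_ONLINE && util_pct < watermark;
          /* slave -> master: the balancer never pulls tasks back */
          if (!src_master && dst_master)
                  return false;
          return true;
  }

  int main(void)
  {
          /* e.g. slave CPUs of one node: util 307 of capacity 1024 -> 29% */
          unsigned long pct = slave_util_pct(307, 1024);

          printf("offline wakeup -> %s\n", preferred_mask(QOS_LEVEL_OFFLINE, pct));
          printf("online wakeup  -> %s\n", preferred_mask(QOS_LEVEL_ONLINE, pct));
          printf("offline master->slave ok: %d\n",
                 can_migrate(QOS_LEVEL_OFFLINE, pct, true, false));
          printf("online slave->master ok: %d\n",
                 can_migrate(QOS_LEVEL_ONLINE, pct, false, true));
          return 0;
  }

Slave -> master pulls are refused both in find_busiest_queue() (via
smt_qos_should_not_busiest()) and in can_migrate_task(), so a task
parked on a slave CPU is only re-placed at its next wakeup.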
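
At boot, the late_initcall splits the CPUs into the two masks and logs
them. On a hypothetical SMT2 machine with 64 CPUs whose siblings are
paired as (N, N+32), the expected output is:

  smt_qos: Master SMT mask: 0-31
  smt_qos: Slave SMT mask: 32-63
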
 arch/arm64/Kconfig.turbo |  17 ++++
 kernel/sched/Makefile    |   1 +
 kernel/sched/fair.c      |  53 ++++++++++--
 kernel/sched/features.h  |   4 +
 kernel/sched/sched.h     |  38 ++++++++
 kernel/sched/smt_qos.c   | 173 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 280 insertions(+), 6 deletions(-)
 create mode 100644 kernel/sched/smt_qos.c

diff --git a/arch/arm64/Kconfig.turbo b/arch/arm64/Kconfig.turbo
index 769fde5fffd7..d6a18d0c4513 100644
--- a/arch/arm64/Kconfig.turbo
+++ b/arch/arm64/Kconfig.turbo
@@ -95,4 +95,21 @@ config ARM64_COPY_FROM_USER_OPT
 	  for higher throughput on large copies. This can be disabled at boot
 	  via copy_opt_disable.
 
+config SMT_QOS
+	bool "Support userspace timer/wft to reduce intra-core contention"
+	depends on SCHED_SMT
+	depends on FAST_IRQ
+	depends on QOS_SCHED_DYNAMIC_AFFINITY
+	depends on CFS_BANDWIDTH && CGROUP_SCHED
+	select QOS_LEVEL
+	default y
+	help
+	  Cloud service providers deploy Best-Effort (BE) and Latency-Sensitive
+	  (LS) tasks on the same physical core to maximize resource utilization.
+	  We observe that an LS task needs more cycles to complete the same
+	  workload due to uarch resource contention. This feature controls the
+	  instruction throughput of the BE task into the pipeline, so that the
+	  LS task running on the other SMT sibling can occupy more uarch
+	  resources and reach a better IPC.
+
 endmenu # "Turbo features selection"
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index cd0be22a94fd..824fd274abec 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -33,3 +33,4 @@ obj-y += fair.o
 obj-y += build_policy.o
 obj-y += build_utility.o
 obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/
+obj-$(CONFIG_SMT_QOS) += smt_qos.o
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 007d42c553e5..82c0668251e8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6955,7 +6955,6 @@ static DEFINE_MUTEX(smart_grid_used_mutex);
 static unsigned long capacity_of(int cpu);
 static int sched_idle_cpu(int cpu);
 static unsigned long cpu_runnable(struct rq *rq);
-static inline bool prefer_cpus_valid(struct task_struct *p);
 
 struct static_key __smart_grid_used;
 
@@ -6969,7 +6968,7 @@ static void smart_grid_usage_dec(void)
 	static_key_slow_dec(&__smart_grid_used);
 }
 
-static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
+struct cpumask *task_prefer_cpus(struct task_struct *p)
 {
 	if (!smart_grid_used() ||
 	    !task_group(p)->auto_affinity)
@@ -7444,9 +7443,7 @@ int tg_rebuild_affinity_domains(int cpu, struct auto_affinity *auto_affi)
 static void __maybe_unused destroy_auto_affinity(struct task_group *tg) {}
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-static inline bool prefer_cpus_valid(struct task_struct *p);
-
-static inline struct cpumask *task_prefer_cpus(struct task_struct *p)
+struct cpumask *task_prefer_cpus(struct task_struct *p)
 {
 	return p->prefer_cpus;
 }
@@ -9187,7 +9184,7 @@ static int __init dynamic_affinity_switch_setup(char *str)
 }
 __setup("dynamic_affinity=", dynamic_affinity_switch_setup);
 
-static inline bool prefer_cpus_valid(struct task_struct *p)
+bool prefer_cpus_valid(struct task_struct *p)
 {
 	struct cpumask *prefer_cpus = task_prefer_cpus(p);
 
@@ -9352,6 +9349,11 @@ static int wake_soft_domain(struct task_struct *p, int target)
 #ifdef CONFIG_QOS_SCHED
 static __always_inline bool qos_sched_enabled(void)
 {
+#ifdef CONFIG_SMT_QOS
+	if (sched_feat(SMT_TAG_PULL))
+		return false;
+#endif
+
 	return true;
 }
 #endif
@@ -9378,6 +9380,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	/* SD_flags and WF_flags share the first nibble */
 	int sd_flag = wake_flags & 0xF;
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	const cpumask_t *backup_select_cpus = NULL;
 	int idlest_cpu = -1;
 #endif
 #ifdef CONFIG_BPF_SCHED
@@ -9394,6 +9397,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	set_task_select_cpus(p, &idlest_cpu, sd_flag);
+	set_qos_task_select_cpus(p, &idlest_cpu, prev_cpu, &backup_select_cpus);
 #endif
 
 	if (wake_flags & WF_TTWU) {
@@ -9488,7 +9492,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		new_cpu = idlest_cpu;
 		schedstat_inc(p->stats.nr_wakeups_force_preferred_cpus);
 	}
+
+	restore_qos_task_select_cpus(p, backup_select_cpus);
 #endif
+
 	return new_cpu;
 }
 
@@ -10419,6 +10426,7 @@ done: __maybe_unused;
 	qos_smt_expel(this_cpu, p);
 #endif
 
+	smt_qos_update_qos_level(rq->cpu, p);
 	return p;
 
 idle:
@@ -10478,6 +10486,8 @@ done: __maybe_unused;
 	qos_smt_expel(this_cpu, NULL);
 #endif
 
+	smt_qos_update_qos_level(rq->cpu, NULL);
+
 	return NULL;
 }
 
@@ -10904,6 +10914,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	}
 #endif
 
+	if (!smt_qos_can_migrate_task(p, env->src_cpu, env->dst_cpu))
+		return 0;
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -11536,6 +11549,10 @@ struct sd_lb_stats {
 	struct sg_lb_stats busiest_stat;	/* Statistics of the busiest group */
 	struct sg_lb_stats local_stat;		/* Statistics of the local group */
+#ifdef CONFIG_SMT_QOS
+	unsigned long total_smt_util;		/* Total utilization of all groups in sd */
+	unsigned long total_smt_capacity;	/* Total capacity of all groups in sd */
+#endif
 };
 
 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 {
@@ -11938,6 +11955,19 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 	return check_cpu_capacity(rq, sd);
 }
 
+#ifdef CONFIG_SMT_QOS
+static inline void smt_qos_update_sg_lb_stats(struct sd_lb_stats *sds, int cpu)
+{
+	if (!smt_qos_enabled())
+		return;
+
+	if (!cpumask_test_cpu(cpu, &master_smt_cpumask)) {
+		sds->total_smt_util += cpu_util_cfs(cpu);
+		sds->total_smt_capacity += capacity_orig_of(cpu);
+	}
+}
+#endif
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -11967,6 +11997,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->group_runnable += cpu_runnable(rq);
 		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
 
+#ifdef CONFIG_SMT_QOS
+		smt_qos_update_sg_lb_stats(sds, i);
+#endif
+
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
 
@@ -12700,6 +12734,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	}
 
 	update_idle_cpu_scan(env, sum_util);
+#ifdef CONFIG_SMT_QOS
+	smt_qos_update_sd_lb_stats(env->sd, env->dst_cpu, sds->total_smt_capacity,
+				   sds->total_smt_util);
+#endif
 }
 
 /**
@@ -13094,6 +13132,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		if (!nr_running)
 			continue;
 
+		if (smt_qos_should_not_busiest(i, env->dst_cpu))
+			continue;
+
 		capacity = capacity_of(i);
 
 		/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 85304c4b2568..a40228db1acb 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -135,3 +135,7 @@ SCHED_FEAT(SOFT_QUOTA, false)
 #endif
 
 SCHED_FEAT(WA_SMT, false)
+
+#ifdef CONFIG_SMT_QOS
+SCHED_FEAT(SMT_TAG_PULL, false)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5d68f3a66d28..4a6811ef3ebd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3788,4 +3788,42 @@ static inline int destroy_soft_domain(struct task_group *tg)
 
 #endif
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+extern bool prefer_cpus_valid(struct task_struct *p);
+extern struct cpumask *task_prefer_cpus(struct task_struct *p);
+#endif
+
+#ifdef CONFIG_SMT_QOS
+extern cpumask_t master_smt_cpumask;
+
+static inline bool smt_qos_enabled(void)
+{
+	return sched_smt_active() && sched_feat(SMT_TAG_PULL);
+}
+
+extern void set_qos_task_select_cpus(struct task_struct *p, int *idlest_cpu, int prev_cpu,
+				     const cpumask_t **backup_select_cpus);
+extern void restore_qos_task_select_cpus(struct task_struct *p,
+					 const cpumask_t *backup_select_cpus);
+extern void smt_qos_update_qos_level(int cpu, struct task_struct *p);
+extern bool smt_qos_should_not_busiest(int src_cpu, int dst_cpu);
+extern bool smt_qos_can_migrate_task(struct task_struct *p, int src_cpu, int dst_cpu);
+extern void smt_qos_update_sd_lb_stats(struct sched_domain *sd, int dst_cpu,
+				       unsigned long smt_capacity, unsigned long smt_util);
+#else
+static inline void set_qos_task_select_cpus(struct task_struct *p, int *idlest_cpu, int prev_cpu,
+					    const cpumask_t **backup_select_cpus) { }
+static inline void restore_qos_task_select_cpus(struct task_struct *p,
+						const cpumask_t *backup_select_cpus) { }
+static inline void smt_qos_update_qos_level(int cpu, struct task_struct *p) { }
+static inline bool smt_qos_should_not_busiest(int src_cpu, int dst_cpu)
+{
+	return false;
+}
+static inline bool smt_qos_can_migrate_task(struct task_struct *p, int src_cpu, int dst_cpu)
+{
+	return true;
+}
+#endif
+
 #endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/smt_qos.c b/kernel/sched/smt_qos.c
new file mode 100644
index 000000000000..799128af37ce
--- /dev/null
+++ b/kernel/sched/smt_qos.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "smt_qos: " fmt
+
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/numa.h>
+#include <linux/percpu.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/smt.h>
+#include <linux/sched/topology.h>
+#include <linux/sysctl.h>
+
+#include "sched.h"
+
+static DEFINE_PER_CPU_ALIGNED(cpumask_t, smt_prefer_cpus);
+static unsigned long numa_smt_util[MAX_NUMNODES];
+/*
+ * Target SMT sibling CPU utilization watermark, in percent.
+ * Valid range: 0-100; default: 50.
+ */
+static unsigned int sched_smt_offline_util_pct = 50;
+static cpumask_t slave_smt_cpumask;
+cpumask_t master_smt_cpumask;
+
+static struct ctl_table smt_util_pct_sysctl_table[] = {
+	{
+		.procname	= "sched_smt_offline_util_pct",
+		.data		= &sched_smt_offline_util_pct,
+		.maxlen		= sizeof(sched_smt_offline_util_pct),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
+	{}
+};
+
+static int __init sched_init_smt_qos(void)
+{
+	int cpu;
+
+	if (!sched_smt_active())
+		return 0;
+
+	register_sysctl_init("kernel", smt_util_pct_sysctl_table);
+
+	cpumask_copy(&master_smt_cpumask, cpu_possible_mask);
+	for_each_possible_cpu(cpu) {
+		if (cpu != cpumask_first(cpu_smt_mask(cpu)))
+			cpumask_clear_cpu(cpu, &master_smt_cpumask);
+	}
+
+	cpumask_andnot(&slave_smt_cpumask, cpu_possible_mask, &master_smt_cpumask);
+	pr_info("Master SMT mask: %*pbl\n", cpumask_pr_args(&master_smt_cpumask));
+	pr_info("Slave SMT mask: %*pbl\n", cpumask_pr_args(&slave_smt_cpumask));
+
+	return 0;
+}
+late_initcall(sched_init_smt_qos);
+
+void set_qos_task_select_cpus(struct task_struct *p, int *idlest_cpu, int prev_cpu,
+			      const cpumask_t **backup_select_cpus)
+{
+	cpumask_t *prefer_cpus = this_cpu_ptr(&smt_prefer_cpus);
+	cpumask_t *prefer_cpumask = &master_smt_cpumask;
+
+	if (!smt_qos_enabled())
+		return;
+
+	if (task_group(p)->qos_level < QOS_LEVEL_ONLINE) {
+		unsigned long smt_util = numa_smt_util[cpu_to_node(prev_cpu)];
+
+		if (smt_util < sched_smt_offline_util_pct)
+			prefer_cpumask = &slave_smt_cpumask;
+	}
+
+	*backup_select_cpus = p->select_cpus;
+	if (unlikely(prefer_cpus_valid(p))) {
+		cpumask_and(prefer_cpus, task_prefer_cpus(p), prefer_cpumask);
+		if (!cpumask_empty(prefer_cpus))
+			p->select_cpus = prefer_cpus;
+		else
+			p->select_cpus = task_prefer_cpus(p);
+	} else {
+		cpumask_and(prefer_cpus, p->cpus_ptr, prefer_cpumask);
+		if (!cpumask_empty(prefer_cpus))
+			p->select_cpus = prefer_cpus;
+		else
+			p->select_cpus = p->cpus_ptr;
+	}
+
+	if (*idlest_cpu == -1 || !cpumask_test_cpu(*idlest_cpu, p->select_cpus))
+		*idlest_cpu = cpumask_first(p->select_cpus);
+}
+
+void restore_qos_task_select_cpus(struct task_struct *p, const cpumask_t *backup_select_cpus)
+{
+	if (!backup_select_cpus)
+		return;
+
+	p->select_cpus = backup_select_cpus;
+}
+
+void smt_qos_update_qos_level(int cpu, struct task_struct *p)
+{
+	int new_status;
+
+	if (!smt_qos_enabled())
+		return;
+
+	new_status = p ? task_group(p)->qos_level : QOS_LEVEL_OFFLINE;
+
+	if (likely(new_status == __this_cpu_read(qos_smt_status)))
+		return;
+
+	__this_cpu_write(qos_smt_status, new_status);
+}
+
+static __always_inline bool is_slave_to_master(int src_cpu, int dst_cpu)
+{
+	return !cpumask_test_cpu(src_cpu, &master_smt_cpumask) &&
+	       cpumask_test_cpu(dst_cpu, &master_smt_cpumask);
+}
+
+bool smt_qos_should_not_busiest(int src_cpu, int dst_cpu)
+{
+	if (!smt_qos_enabled())
+		return false;
+
+	/*
+	 * Migration of tasks from SMT siblings to
+	 * the primary SMT CPU is restricted.
+	 */
+	return is_slave_to_master(src_cpu, dst_cpu);
+}
+
+bool smt_qos_can_migrate_task(struct task_struct *p, int src_cpu, int dst_cpu)
+{
+	if (!smt_qos_enabled())
+		return true;
+
+	/*
+	 * Only offline tasks are allowed to be migrated from
+	 * primary SMT CPUs to SMT siblings.
+	 */
+	if (cpumask_test_cpu(src_cpu, &master_smt_cpumask) &&
+	    !cpumask_test_cpu(dst_cpu, &master_smt_cpumask)) {
+		unsigned long smt_util;
+
+		if (task_group(p)->qos_level >= QOS_LEVEL_ONLINE)
+			return false;
+
+		smt_util = numa_smt_util[cpu_to_node(dst_cpu)];
+		if (smt_util >= sched_smt_offline_util_pct)
+			return false;
+	}
+
+	/*
+	 * Migration of tasks from SMT siblings to
+	 * the primary SMT CPU is restricted.
+	 */
+	return !is_slave_to_master(src_cpu, dst_cpu);
+}
+
+void smt_qos_update_sd_lb_stats(struct sched_domain *sd, int dst_cpu,
+				unsigned long smt_capacity, unsigned long smt_util)
+{
+	if (!smt_qos_enabled() || !smt_capacity)
+		return;
+
+	if (!(sd->flags & SD_NUMA) && (sd->parent && (sd->parent->flags & SD_NUMA)))
+		numa_smt_util[cpu_to_node(dst_cpu)] = (smt_util * 100) / smt_capacity;
+}
-- 
2.34.1