hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/8929

----------------------------------------

Reuse QOS_LEVEL to distinguish between online and offline tasks, and
reuse QOS_SCHED_DYNAMIC_AFFINITY to select the master SMT CPU for
online tasks.

Sample the CPU utilization of all slave SMT cores within a NUMA node
when collecting load balancing statistics, then pick the CPU for each
offline task and distribute offline tasks across SMT sibling cores
based on the target SMT sibling CPU utilization watermark:

	+--------+               +--------+
	|        | online/offline|        |
	|  CPU0  |<------------->|  CPU2  |
	|        |       |       |        |
	+--------+       |       +--------+
	    |            |            |
	    | offline    | offline    | offline
	    \/           |            \/
	+--------+       |       +---------+
	|        |       \/      |         |
	|  CPU1  |<------------->|  CPU3   |
	|        |    offline    |         |
	+--------+               +---------+

Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
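[Reviewer note, not for the commit log] The policy implemented below
boils down to: online tasks are steered onto master SMT CPUs, while
offline tasks may use slave SMT CPUs only while the sampled slave
utilization of their NUMA node stays under the
sched_smt_offline_util_pct watermark. A minimal userspace sketch of
that decision follows; the QOS_ONLINE/QOS_OFFLINE values and the
sampled percentages are illustrative assumptions, not the kernel's
definitions.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the sched_smt_offline_util_pct sysctl (default 50, range 0-100). */
static unsigned int smt_offline_util_pct = 50;

/* Illustrative stand-ins for the cgroup qos_level values. */
enum qos_level { QOS_OFFLINE = -1, QOS_ONLINE = 0 };

/*
 * True when a waking task should prefer slave SMT CPUs: only offline
 * tasks qualify, and only while the per-node slave SMT utilization is
 * below the watermark.
 */
static bool prefer_slave_smt(enum qos_level qos,
			     unsigned long node_slave_util_pct)
{
	if (qos >= QOS_ONLINE)
		return false;	/* online tasks: master SMT CPUs only */

	return node_slave_util_pct < smt_offline_util_pct;
}

int main(void)
{
	/* Per node, sampled as total_smt_util * 100 / total_smt_capacity. */
	unsigned long node_util_pct[] = { 30, 75 };

	for (int node = 0; node < 2; node++)
		printf("node %d: offline task -> %s SMT\n", node,
		       prefer_slave_smt(QOS_OFFLINE, node_util_pct[node]) ?
				"slave" : "master");
	return 0;
}

With node 0 sampled at 30% the offline task is allowed onto a slave SMT
CPU, while node 1 at 75% keeps it on the master mask; this mirrors the
checks against numa_smt_util[] in smt_qos_set_task_select_cpus() and
smt_qos_can_migrate_task().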
 arch/arm64/Kconfig.turbo |  17 ++++
 kernel/sched/fair.c      | 212 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/features.h  |   4 +
 3 files changed, 233 insertions(+)

diff --git a/arch/arm64/Kconfig.turbo b/arch/arm64/Kconfig.turbo
index 778ea1025c2c..aa0af04cb2ab 100644
--- a/arch/arm64/Kconfig.turbo
+++ b/arch/arm64/Kconfig.turbo
@@ -84,4 +84,21 @@ config DYNAMIC_XCALL
 	  and a kernel module which provides customized implementation.
 
+config SMT_QOS
+	bool "Support userspace timer/wft to reduce intra-core contention"
+	depends on SCHED_SMT
+	depends on FAST_IRQ
+	depends on QOS_SCHED_DYNAMIC_AFFINITY
+	depends on CFS_BANDWIDTH && CGROUP_SCHED
+	select QOS_LEVEL
+	default y
+	help
+	  Cloud service providers deploy Best-Effort (BE) and Latency
+	  Sensitive (LS) tasks on the same physical core to maximize
+	  resource utilization. We observe that LS tasks need more cycles
+	  to complete the same workload due to uarch resource contention.
+	  This feature throttles the instruction throughput of BE tasks
+	  into the pipeline, so that the LS task running on the sibling
+	  SMT can occupy more uarch resources and reach a better IPC.
+
 endmenu # "Turbo features selection"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 250ef9a069c2..3e9f0b8070b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9321,9 +9321,179 @@ static int wake_soft_domain(struct task_struct *p, int target)
 }
 #endif
 
+#ifdef CONFIG_SMT_QOS
+static DEFINE_PER_CPU_ALIGNED(cpumask_t, smt_prefer_cpus);
+static unsigned long numa_smt_util[MAX_NUMNODES];
+/*
+ * Target SMT sibling CPU utilization watermark.
+ * Default: 50, valid range: 0-100.
+ */
+static unsigned int sched_smt_offline_util_pct = 50;
+static cpumask_t master_smt_cpumask;
+static cpumask_t slave_smt_cpumask;
+
+static struct ctl_table smt_util_pct_sysctl_table[] = {
+	{
+		.procname	= "sched_smt_offline_util_pct",
+		.data		= &sched_smt_offline_util_pct,
+		.maxlen		= sizeof(sched_smt_offline_util_pct),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
+	{}
+};
+
+static int __init sched_init_smt_qos(void)
+{
+	int cpu;
+
+	if (!sched_smt_active())
+		return 0;
+
+	register_sysctl_init("kernel", smt_util_pct_sysctl_table);
+
+	cpumask_copy(&master_smt_cpumask, cpu_possible_mask);
+	for_each_possible_cpu(cpu) {
+		if (cpu != cpumask_first(cpu_smt_mask(cpu)))
+			cpumask_clear_cpu(cpu, &master_smt_cpumask);
+	}
+
+	cpumask_andnot(&slave_smt_cpumask, cpu_possible_mask, &master_smt_cpumask);
+	pr_info("Master SMT mask: %*pbl\n", cpumask_pr_args(&master_smt_cpumask));
+	pr_info("Slave SMT mask: %*pbl\n", cpumask_pr_args(&slave_smt_cpumask));
+
+	return 0;
+}
+late_initcall(sched_init_smt_qos);
+
+static __always_inline bool smt_qos_enabled(void)
+{
+	return sched_smt_active() && sched_feat(SMT_TAG_PULL);
+}
+
+static inline void smt_qos_set_task_select_cpus(struct task_struct *p,
+						const cpumask_t **backup_select_cpus,
+						int *idlest_cpu, int prev_cpu)
+{
+	cpumask_t *prefer_cpus = this_cpu_ptr(&smt_prefer_cpus);
+	cpumask_t *prefer_cpumask = &master_smt_cpumask;
+
+	if (!smt_qos_enabled())
+		return;
+
+	if (task_group(p)->qos_level < QOS_LEVEL_ONLINE) {
+		unsigned long smt_util = numa_smt_util[cpu_to_node(prev_cpu)];
+
+		if (smt_util < sched_smt_offline_util_pct)
+			prefer_cpumask = &slave_smt_cpumask;
+	}
+
+	if (*idlest_cpu != -1 && !cpumask_test_cpu(*idlest_cpu, prefer_cpumask))
+		*idlest_cpu = -1;
+
+	cpumask_copy(prefer_cpus, task_prefer_cpus(p));
+	if (cpumask_empty(prefer_cpus))
+		cpumask_and(prefer_cpus, p->cpus_ptr, prefer_cpumask);
+	else
+		cpumask_and(prefer_cpus, prefer_cpus, prefer_cpumask);
+
+	*backup_select_cpus = p->select_cpus;
+	p->select_cpus = prefer_cpus;
+}
+
+static inline void smt_qos_restore_task_select_cpus(struct task_struct *p,
+						    const cpumask_t *backup_select_cpus)
+{
+	if (!smt_qos_enabled())
+		return;
+
+	p->select_cpus = backup_select_cpus;
+}
+
+static inline void smt_qos_update_qos_level(int cpu, struct task_struct *p)
+{
+	int new_status;
+
+	if (!smt_qos_enabled())
+		return;
+
+	new_status = p ? task_group(p)->qos_level : QOS_LEVEL_OFFLINE;
+
+	if (likely(new_status == __this_cpu_read(qos_smt_status)))
+		return;
+
+	__this_cpu_write(qos_smt_status, new_status);
+}
+
+static inline bool is_slave_to_master(int src_cpu, int dst_cpu)
+{
+	return !cpumask_test_cpu(src_cpu, &master_smt_cpumask) &&
+	       cpumask_test_cpu(dst_cpu, &master_smt_cpumask);
+}
+
+static inline bool smt_qos_should_not_busiest(int src_cpu, int dst_cpu)
+{
+	if (!smt_qos_enabled())
+		return false;
+
+	/*
+	 * Migration of tasks from SMT siblings to
+	 * the primary SMT CPU is restricted.
+	 */
+	return is_slave_to_master(src_cpu, dst_cpu);
+}
+
+static inline bool smt_qos_can_migrate_task(struct task_struct *p, int src_cpu,
+					    int dst_cpu)
+{
+	if (!smt_qos_enabled())
+		return true;
+
+	/*
+	 * Only offline tasks are allowed to be migrated from
+	 * primary SMT CPUs to SMT siblings.
+	 */
+	if (cpumask_test_cpu(src_cpu, &master_smt_cpumask) &&
+	    !cpumask_test_cpu(dst_cpu, &master_smt_cpumask)) {
+		unsigned long smt_util;
+
+		if (task_group(p)->qos_level >= QOS_LEVEL_ONLINE)
+			return false;
+
+		smt_util = numa_smt_util[cpu_to_node(dst_cpu)];
+		if (smt_util >= sched_smt_offline_util_pct)
+			return false;
+	}
+
+	/*
+	 * Migration of tasks from SMT siblings to
+	 * the primary SMT CPU is restricted.
+	 */
+	return !is_slave_to_master(src_cpu, dst_cpu);
+}
+
+static inline void smt_qos_update_sd_ld_stats(struct sched_domain *sd, int dst_cpu,
+					      unsigned long total_smt_capacity,
+					      unsigned long total_smt_util)
+{
+	if (!smt_qos_enabled() || !total_smt_capacity)
+		return;
+
+	if (!(sd->flags & SD_NUMA) && (sd->parent && (sd->parent->flags & SD_NUMA)))
+		numa_smt_util[cpu_to_node(dst_cpu)] = (total_smt_util * 100) / total_smt_capacity;
+}
+#endif /* CONFIG_SMT_QOS */
+
 #ifdef CONFIG_QOS_SCHED
 static __always_inline bool qos_sched_enabled(void)
 {
+#ifdef CONFIG_SMT_QOS
+	if (sched_feat(SMT_TAG_PULL))
+		return false;
+#endif
+
 	return true;
 }
 #endif
@@ -9356,6 +9526,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	struct sched_migrate_ctx ctx;
 	int ret;
 #endif
+#ifdef CONFIG_SMT_QOS
+	const cpumask_t *backup_select_cpus;
+#endif
 
 	time = schedstat_start_time();
@@ -9367,6 +9540,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	set_task_select_cpus(p, &idlest_cpu, sd_flag);
 #endif
+#ifdef CONFIG_SMT_QOS
+	smt_qos_set_task_select_cpus(p, &backup_select_cpus, &idlest_cpu, prev_cpu);
+#endif
 
 	if (wake_flags & WF_TTWU) {
 		record_wakee(p);
@@ -9461,6 +9637,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		schedstat_inc(p->stats.nr_wakeups_force_preferred_cpus);
 	}
 #endif
+
+#ifdef CONFIG_SMT_QOS
+	smt_qos_restore_task_select_cpus(p, backup_select_cpus);
+#endif
 	return new_cpu;
 }
@@ -10377,6 +10557,9 @@ done: __maybe_unused;
 	qos_smt_expel(this_cpu, p);
 #endif
 
+#ifdef CONFIG_SMT_QOS
+	smt_qos_update_qos_level(rq->cpu, p);
+#endif
 	return p;
 
 idle:
@@ -10436,6 +10619,10 @@ done: __maybe_unused;
 	qos_smt_expel(this_cpu, NULL);
 #endif
 
+#ifdef CONFIG_SMT_QOS
+	smt_qos_update_qos_level(rq->cpu, NULL);
+#endif
+
 	return NULL;
 }
@@ -10862,6 +11049,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	}
 #endif
 
+#ifdef CONFIG_SMT_QOS
+	if (!smt_qos_can_migrate_task(p, env->src_cpu, env->dst_cpu))
+		return 0;
+#endif
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -11494,6 +11686,10 @@ struct sd_lb_stats {
 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
+#ifdef CONFIG_SMT_QOS
+	unsigned long total_smt_util;	/* Sum of slave SMT CPU utilization in the sd */
+	unsigned long total_smt_capacity; /* Sum of slave SMT CPU capacity in the sd */
+#endif
 };
 
 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
@@ -11924,6 +12120,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->group_util += cpu_util_cfs(i);
 		sgs->group_runnable += cpu_runnable(rq);
 		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+#ifdef CONFIG_SMT_QOS
+		if (sched_smt_active() && !cpumask_test_cpu(i, &master_smt_cpumask)) {
+			sds->total_smt_util += cpu_util_cfs(i);
+			sds->total_smt_capacity += capacity_orig_of(i);
+		}
+#endif
 
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
@@ -12658,6 +12860,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	}
 
 	update_idle_cpu_scan(env, sum_util);
+
+#ifdef CONFIG_SMT_QOS
+	smt_qos_update_sd_ld_stats(env->sd, env->dst_cpu, sds->total_smt_capacity,
+				   sds->total_smt_util);
+#endif
 }
 
 /**
@@ -13052,6 +13259,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		if (!nr_running)
 			continue;
 
+#ifdef CONFIG_SMT_QOS
+		if (smt_qos_should_not_busiest(i, env->dst_cpu))
+			continue;
+#endif
+
 		capacity = capacity_of(i);
 
 		/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index c9ad8e72ecd0..446d136654d9 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -130,3 +130,7 @@ SCHED_FEAT(SOFT_QUOTA, false)
 #endif
 
 SCHED_FEAT(WA_SMT, false)
+
+#ifdef CONFIG_SMT_QOS
+SCHED_FEAT(SMT_TAG_PULL, false)
+#endif
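
[Reviewer note] To make the CPU partitioning above easier to check, here
is a self-contained userspace mock of the master/slave split performed
by sched_init_smt_qos(). It assumes 2-way SMT with sibling pairs
(2n, 2n+1); that numbering is purely illustrative, since the kernel
derives siblings from cpu_smt_mask() and guarantees no such layout.

#include <stdio.h>

#define NR_CPUS 8	/* illustrative CPU count for the mock */

int main(void)
{
	unsigned long master = 0, slave = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		/* Stand-in for cpumask_first(cpu_smt_mask(cpu)) under
		 * the assumed (2n, 2n+1) sibling pairing.
		 */
		int smt_first = cpu & ~1;

		if (cpu == smt_first)
			master |= 1UL << cpu;	/* first sibling: master */
		else
			slave |= 1UL << cpu;	/* other siblings: slave */
	}

	/* For 8 CPUs this prints: master: 0x55 slave: 0xaa */
	printf("master: 0x%02lx slave: 0x%02lx\n", master, slave);
	return 0;
}

update_sg_lb_stats() then accumulates cpu_util_cfs() and
capacity_orig_of() over the slave mask only, and
smt_qos_update_sd_ld_stats() publishes
(total_smt_util * 100) / total_smt_capacity into numa_smt_util[] at the
highest non-NUMA domain; that percentage is the watermark input used by
the sketch in the notes section above.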
-- 
2.34.1