[PATCH OLK-5.10 V1] sched: Support NUMA parallel scheduling for multiple processes

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICBBNL

--------------------------------

For architectures with multiple NUMA node levels and large distances
between nodes, a better approach is to support processes running in
parallel on each NUMA node. The usage is restricted to the following
scenarios:
1. No CPU binding for user-space processes;
2. It is applicable to distributed applications, such as business
   architectures with one master and multiple slaves running in
   parallel;
3. The existing "qos dynamic affinity" and "qos smart grid" features
   must not be used simultaneously.

Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 arch/arm64/Kconfig                     |  1 +
 arch/arm64/configs/openeuler_defconfig |  1 +
 arch/arm64/mm/numa.c                   | 74 ++++++++++++++++++++++++++
 include/linux/perf_event.h             |  2 +
 include/linux/sched.h                  |  3 ++
 init/Kconfig                           | 22 ++++++++
 kernel/cgroup/cpuset.c                 |  8 ++-
 kernel/events/core.c                   | 13 +++++
 kernel/sched/debug.c                   | 37 +++++++++++++
 kernel/sched/fair.c                    | 26 ++++++++-
 kernel/sched/features.h                |  4 ++
 11 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 93ced97f8c6c..76f07a283d4e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -79,6 +79,7 @@ config ARM64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_SCHED_KEEP_ON_CORE
+	select ARCH_SUPPORTS_SCHED_PARAL
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index fb9f92d11bde..4dd4994d6fbb 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -190,6 +190,7 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_SCHED_STEAL=y
 CONFIG_SCHED_KEEP_ON_CORE=y
+CONFIG_SCHED_PARAL=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 # CONFIG_SYSFS_DEPRECATED is not set
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 99a746e14f2b..6340513bab65 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -777,3 +777,77 @@ void __init arm64_numa_init(void)
 
 	numa_init(dummy_numa_init);
 }
+
+#ifdef CONFIG_SCHED_PARAL
+#include <linux/perf_event.h>
+
+static atomic_t paral_nid_last = ATOMIC_INIT(-1);
+
+int probe_pmu_numa_event(void)
+{
+	struct perf_event *event;
+	struct perf_event_attr attr = {};
+	int type = perf_pmu_type_of_name("hisi_sccl3_hha0");
+
+	if (type == -1)
+		return -EINVAL;
+
+	attr.type = type;
+	attr.config = 0x02;
+	attr.size = sizeof(struct perf_event_attr);
+	attr.pinned = 1;
+	attr.disabled = 1;
+	attr.sample_period = 0;
+
+	event = perf_event_create_kernel_counter(&attr, smp_processor_id(),
+						 NULL, NULL, NULL);
+	if (IS_ERR(event))
+		return PTR_ERR(event);
+
+	perf_event_release_kernel(event);
+
+	return 0;
+}
+
+static inline int update_sched_paral_nid(void)
+{
+	int onid, nnid;
+
+	do {
+		onid = atomic_read(&paral_nid_last);
+		nnid = (onid >= INT_MAX) ? 0 : (onid + 1);
+	} while (atomic_cmpxchg(&paral_nid_last, onid, nnid) != onid);
+
+	return nnid;
+}
+
+void set_task_paral_node(struct task_struct *p)
+{
+	int nid;
+	int i = 0;
+	const cpumask_t *cpus_mask;
+
+	if (is_global_init(current))
+		return;
+
+	if (p->flags & PF_KTHREAD || p->tgid != p->pid)
+		return;
+
+	while (i < nr_node_ids) {
+		nid = update_sched_paral_nid() % nr_node_ids;
+		cpus_mask = cpumask_of_node(nid);
+
+		if (cpumask_empty(cpus_mask) ||
+		    !cpumask_subset(cpus_mask, p->cpus_ptr)) {
+			i++;
+			continue;
+		}
+
+		cpumask_copy(p->prefer_cpus, cpus_mask);
+		break;
+	}
+}
+#else
+void set_task_paral_node(struct task_struct *p) {}
+int probe_pmu_numa_event(void) { return -1; }
+#endif
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2544bfdd948b..7814ff2e45c7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1485,6 +1485,7 @@ extern void perf_event_task_tick(void);
 extern int perf_event_account_interrupt(struct perf_event *event);
 extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
+extern int perf_pmu_type_of_name(const char *name);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1577,6 +1578,7 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
 {
 	return 0;
 }
+static inline int perf_pmu_type_of_name(const char *name) { return -1; }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3170b7f81fa..181230773350 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2309,6 +2309,9 @@ int set_prefer_cpus_ptr(struct task_struct *p,
 int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask);
 void sched_prefer_cpus_free(struct task_struct *p);
 void dynamic_affinity_enable(void);
+bool sched_paral_used(void);
+void set_task_paral_node(struct task_struct *p);
+int probe_pmu_numa_event(void);
 #endif
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
diff --git a/init/Kconfig b/init/Kconfig
index 3a6a14e66acd..5f88cce193e8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1389,6 +1389,28 @@ config SCHED_KEEP_ON_CORE
 	  otherwise the task will not be migrated and the cpu0 will
 	  still be used.
 
+#
+# For architectures that want to enable the support for SCHED_PARAL
+#
+config ARCH_SUPPORTS_SCHED_PARAL
+	bool
+
+config SCHED_PARAL
+	bool "Parallelly schedule processes on different NUMA nodes"
+	depends on ARCH_SUPPORTS_SCHED_PARAL
+	depends on QOS_SCHED_DYNAMIC_AFFINITY
+	default n
+	help
+	  By enabling this feature, processes can be scheduled in parallel
+	  on various NUMA nodes to better utilize the cache in NUMA node.
+	  The usage is restricted to the following scenarios:
+	  1. No CPU binding is performed for user-space processes;
+	  2. It is applicable to distributed applications, such as business
+	     architectures with one master and multiple slaves running in
+	     parallel;
+	  3. The existing "qos dynamic affinity" and "qos smart grid"
+	     features must not be used simultaneously.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	select PROC_CHILDREN
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 7ecff06d2026..1f6ed08af9f5 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2423,7 +2423,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		 */
 		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-		set_prefer_cpus_ptr(task, prefer_cpus_attach);
+		if (!sched_paral_used() || !cpumask_empty(prefer_cpus_attach))
+			set_prefer_cpus_ptr(task, prefer_cpus_attach);
 #endif
 
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
@@ -3131,7 +3132,10 @@ static void cpuset_fork(struct task_struct *task)
 
 	set_cpus_allowed_ptr(task, current->cpus_ptr);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	set_prefer_cpus_ptr(task, current->prefer_cpus);
+	rcu_read_lock();
+	if (!sched_paral_used() || !cpumask_empty(task_cs(current)->prefer_cpus))
+		set_prefer_cpus_ptr(task, current->prefer_cpus);
+	rcu_read_unlock();
 #endif
 	task->mems_allowed = current->mems_allowed;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b56b572f1bd0..e0c193083aa0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13167,6 +13167,19 @@ static int __init perf_event_sysfs_init(void)
 }
 device_initcall(perf_event_sysfs_init);
 
+int perf_pmu_type_of_name(const char *name)
+{
+	unsigned int i;
+	struct pmu *pmu;
+
+	idr_for_each_entry(&pmu_idr, pmu, i) {
+		if (!strcmp(pmu->name, name))
+			return pmu->type;
+	}
+
+	return -1;
+}
+
 #ifdef CONFIG_CGROUP_PERF
 static struct cgroup_subsys_state *
 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4275398bc713..d6b53d54dcf5 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -96,6 +96,39 @@ static void sched_feat_disable(int i) { };
 static void sched_feat_enable(int i) { };
 #endif /* CONFIG_JUMP_LABEL */
 
+#ifdef CONFIG_SCHED_PARAL
+static void sched_feat_disable_paral(char *cmp)
+{
+	struct task_struct *tsk, *t;
+
+	if (strncmp(cmp, "PARAL", 5) == 0) {
+		read_lock(&tasklist_lock);
+		for_each_process(tsk) {
+			if (tsk->flags & PF_KTHREAD || is_global_init(tsk))
+				continue;
+
+			for_each_thread(tsk, t)
+				cpumask_clear(t->prefer_cpus);
+		}
+		read_unlock(&tasklist_lock);
+	}
+}
+
+static bool sched_feat_enable_paral(char *cmp)
+{
+	if (strncmp(cmp, "PARAL", 5) != 0)
+		return true;
+
+	if (probe_pmu_numa_event() != 0)
+		return false;
+
+	return true;
+}
+#else
+static void sched_feat_disable_paral(char *cmp) {};
+static bool sched_feat_enable_paral(char *cmp) { return true; };
+#endif /* CONFIG_SCHED_PARAL */
+
 static int sched_feat_set(char *cmp)
 {
 	int i;
@@ -112,8 +145,12 @@ static int sched_feat_set(char *cmp)
 
 	if (neg) {
 		sysctl_sched_features &= ~(1UL << i);
+		sched_feat_disable_paral(cmp);
 		sched_feat_disable(i);
 	} else {
+		if (!sched_feat_enable_paral(cmp))
+			return -EPERM;
+
 		sysctl_sched_features |= (1UL << i);
 		sched_feat_enable(i);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 437572c568ee..396c2b87c012 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8063,6 +8063,16 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 }
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+bool sched_paral_used(void)
+{
+#ifdef CONFIG_SCHED_PARAL
+	if (sched_feat(PARAL))
+		return true;
+#endif
+
+	return false;
+}
+
 static DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_used);
 
 static __always_inline bool dynamic_affinity_used(void)
@@ -8168,6 +8178,14 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 	}
 	rcu_read_unlock();
 
+	/* In extreme cases, it may cause uneven system load. */
+	if (sched_paral_used() && sysctl_sched_util_low_pct == 100 && nr_cpus_valid > 0) {
+		p->select_cpus = p->prefer_cpus;
+		if (sd_flag & SD_BALANCE_WAKE)
+			schedstat_inc(p->se.statistics.nr_wakeups_preferred_cpus);
+		return;
+	}
+
 	/*
 	 * Follow cases should select cpus_ptr, checking by condition of
 	 * tg_capacity > nr_cpus_valid:
@@ -8225,7 +8243,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 #endif
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	if (dynamic_affinity_used() || smart_grid_used())
+	if (dynamic_affinity_used() || smart_grid_used() || sched_paral_used())
 		set_task_select_cpus(p, &idlest_cpu, sd_flag);
 #endif
 
@@ -9867,7 +9885,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	p->select_cpus = p->cpus_ptr;
-	if (dynamic_affinity_used() || smart_grid_used())
+	if (dynamic_affinity_used() || smart_grid_used() || sched_paral_used())
 		set_task_select_cpus(p, NULL, 0);
 	if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) {
 #else
@@ -13549,6 +13567,10 @@ static void task_fork_fair(struct task_struct *p)
 	}
 
 	se->vruntime -= cfs_rq->min_vruntime;
+
+	if (sched_paral_used())
+		set_task_paral_node(p);
+
 	rq_unlock(rq, &rf);
 }
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index fb885b20ba34..1fd89af55681 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -74,6 +74,10 @@ SCHED_FEAT(STEAL, false)
 SCHED_FEAT(KEEP_ON_CORE, false)
 #endif
 
+#ifdef CONFIG_SCHED_PARAL
+SCHED_FEAT(PARAL, false)
+#endif
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
 * in a single rq->lock section. Default disabled because the
-- 
2.25.1
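
Since the patch wires PARAL into sched_feat_set(), the feature is
toggled through the usual sched_features debugfs file once the kernel
is built with CONFIG_SCHED_PARAL=y. A minimal usage sketch, assuming
debugfs is mounted at /sys/kernel/debug and the machine exposes the
hisi_sccl3_hha0 uncore PMU that probe_pmu_numa_event() looks for:

    # Enable NUMA-parallel scheduling; the write fails with EPERM if
    # probe_pmu_numa_event() cannot find the PMU.
    echo PARAL > /sys/kernel/debug/sched_features

    # Disable it again; sched_feat_disable_paral() clears prefer_cpus
    # of all user-space threads.
    echo NO_PARAL > /sys/kernel/debug/sched_features

While PARAL is set, task_fork_fair() calls set_task_paral_node() for
each new thread-group leader, which copies the CPUs of one NUMA node,
chosen round-robin via update_sched_paral_nid(), into the task's
prefer_cpus mask, provided the task is not a kernel thread and the
node's CPUs are a subset of p->cpus_ptr.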

Feedback: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/16529
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/PJO...