[PATCH OLK-6.6 V1] sched: Support NUMA parallel scheduling for multiple processes

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICBBNL

--------------------------------

For architectures with multiple NUMA node levels and large distances
between nodes, a better approach is to support processes running in
parallel on each NUMA node.

The usage is restricted to the following scenarios:
1. No CPU binding for user-space processes;
2. It is applicable to distributed applications, such as business
   architectures with one master and multiple slaves running in parallel;
3. The existing "qos dynamic affinity" and "qos smart grid" features
   must not be used simultaneously.

Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 arch/arm64/Kconfig                     |  1 +
 arch/arm64/configs/openeuler_defconfig |  1 +
 arch/arm64/include/asm/prefer_numa.h   | 13 +++++
 arch/arm64/kernel/Makefile             |  1 +
 arch/arm64/kernel/prefer_numa.c        | 68 ++++++++++++++++++++++++++
 fs/proc/array.c                        |  3 --
 include/linux/perf_event.h             |  2 +
 include/linux/sched.h                  |  6 +++
 init/Kconfig                           | 22 +++++++++
 kernel/cgroup/cpuset.c                 |  8 ++-
 kernel/events/core.c                   | 13 +++++
 kernel/fork.c                          | 11 ++---
 kernel/sched/debug.c                   | 43 +++++++++++++++-
 kernel/sched/fair.c                    | 39 ++++++++++++---
 kernel/sched/features.h                |  4 ++
 15 files changed, 214 insertions(+), 21 deletions(-)
 create mode 100644 arch/arm64/include/asm/prefer_numa.h
 create mode 100644 arch/arm64/kernel/prefer_numa.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5422d1502fd6..b1f550c8c82a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -105,6 +105,7 @@ config ARM64
         select ARCH_SUPPORTS_ATOMIC_RMW
         select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
         select ARCH_SUPPORTS_NUMA_BALANCING
+        select ARCH_SUPPORTS_SCHED_PARAL
         select ARCH_SUPPORTS_PAGE_TABLE_CHECK
         select ARCH_SUPPORTS_PER_VMA_LOCK
         select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 3cfff0701479..3d352fb1ae57 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -209,6 +209,7 @@ CONFIG_USER_NS=y
 CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_SCHED_STEAL=y
+CONFIG_SCHED_PARAL=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_RELAY=y
diff --git a/arch/arm64/include/asm/prefer_numa.h b/arch/arm64/include/asm/prefer_numa.h
new file mode 100644
index 000000000000..6c8e2b2142b9
--- /dev/null
+++ b/arch/arm64/include/asm/prefer_numa.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_PREFER_NUMA_H
+#define __ASM_PREFER_NUMA_H
+
+#include <linux/sched.h>
+
+#define PROBE_NUMA_PMU_NAME "hisi_sccl3_hha0"
+#define PROBE_NUMA_PMU_EVENT 0x02
+
+void set_task_paral_node(struct task_struct *p);
+int probe_pmu_numa_event(void);
+
+#endif /* __ASM_PREFER_NUMA_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 3d404a2cc961..b936be9d8baa 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_IPI_AS_NMI)                += ipi_nmi.o
 obj-$(CONFIG_HISI_VIRTCCA_GUEST)        += virtcca_cvm_guest.o virtcca_cvm_tsi.o
 obj-$(CONFIG_HISI_VIRTCCA_HOST)         += virtcca_cvm_host.o
 CFLAGS_patch-scs.o                      += -mbranch-protection=none
+obj-$(CONFIG_SCHED_PARAL)               += prefer_numa.o
 
 # Force dependency (vdso*-wrap.S includes vdso.so through incbin)
 $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so
diff --git a/arch/arm64/kernel/prefer_numa.c b/arch/arm64/kernel/prefer_numa.c
new file mode 100644
index 000000000000..e6f3f43fb97a
--- /dev/null
+++ b/arch/arm64/kernel/prefer_numa.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * choose a prefer numa node
+ *
+ * Copyright (C) 2025 Huawei Limited.
+ */
+#include <linux/perf_event.h>
+#include <asm/prefer_numa.h>
+
+static atomic_t paral_nid_last = ATOMIC_INIT(-1);
+
+int probe_pmu_numa_event(void)
+{
+        struct perf_event *event;
+        struct perf_event_attr attr = {};
+        int type = perf_pmu_type_of_name(PROBE_NUMA_PMU_NAME);
+
+        if (type == -1)
+                return -EINVAL;
+
+        attr.type = type;
+        attr.config = PROBE_NUMA_PMU_EVENT;
+        attr.size = sizeof(struct perf_event_attr);
+        attr.pinned = 1;
+        attr.disabled = 1;
+        attr.sample_period = 0;
+
+        event = perf_event_create_kernel_counter(&attr, smp_processor_id(),
+                                                 NULL, NULL, NULL);
+        if (IS_ERR(event))
+                return PTR_ERR(event);
+
+        perf_event_release_kernel(event);
+
+        return 0;
+}
+
+static inline unsigned int update_sched_paral_nid(void)
+{
+        return (unsigned int)atomic_inc_return(&paral_nid_last);
+}
+
+void set_task_paral_node(struct task_struct *p)
+{
+        int nid;
+        int i = 0;
+        const cpumask_t *cpus_mask;
+
+        if (is_global_init(current))
+                return;
+
+        if (p->flags & PF_KTHREAD || p->tgid != p->pid)
+                return;
+
+        while (i < nr_node_ids) {
+                nid = update_sched_paral_nid() % nr_node_ids;
+                cpus_mask = cpumask_of_node(nid);
+
+                if (cpumask_empty(cpus_mask) ||
+                    !cpumask_subset(cpus_mask, p->cpus_ptr)) {
+                        i++;
+                        continue;
+                }
+
+                cpumask_copy(p->prefer_cpus, cpus_mask);
+                break;
+        }
+}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index a933a878df3c..6a4b0a850dce 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -439,9 +439,6 @@ __weak void arch_proc_pid_thread_features(struct seq_file *m,
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 static void task_cpus_preferred(struct seq_file *m, struct task_struct *task)
 {
-        if (!dynamic_affinity_enabled())
-                return;
-
         seq_printf(m, "Cpus_preferred:\t%*pb\n",
                    cpumask_pr_args(task->prefer_cpus));
         seq_printf(m, "Cpus_preferred_list:\t%*pbl\n",
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 826fb16906fe..14ed13f4b408 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1778,6 +1778,7 @@ extern void perf_event_task_tick(void);
 extern int perf_event_account_interrupt(struct perf_event *event);
 extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
+extern int perf_pmu_type_of_name(const char *name);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1864,6 +1865,7 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
 {
         return 0;
 }
+static inline int perf_pmu_type_of_name(const char *name) { return -1; }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3979c34e9b83..ee10780715f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2627,6 +2627,12 @@ static inline bool dynamic_affinity_enabled(void)
 {
         return static_branch_unlikely(&__dynamic_affinity_switch);
 }
+
+#ifdef CONFIG_SCHED_PARAL
+bool sched_paral_used(void);
+#else
+static inline bool sched_paral_used(void) { return false; }
+#endif
 #endif
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
diff --git a/init/Kconfig b/init/Kconfig
index c8bd58347a87..925e8517a7e8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1484,6 +1484,28 @@ config SCHED_STEAL
 
           If unsure, say N here.
 
+#
+# For architectures that want to enable the support for SCHED_PARAL
+#
+config ARCH_SUPPORTS_SCHED_PARAL
+        bool
+
+config SCHED_PARAL
+        bool "Parallelly schedule processes on different NUMA nodes"
+        depends on ARCH_SUPPORTS_SCHED_PARAL
+        depends on QOS_SCHED_DYNAMIC_AFFINITY
+        default n
+        help
+          By enabling this feature, processes can be scheduled in parallel
+          on various NUMA nodes to better utilize the cache in NUMA node.
+          The usage is restricted to the following scenarios:
+          1. No CPU binding is performed for user-space processes;
+          2. It is applicable to distributed applications, such as business
+             architectures with one master and multiple slaves running in
+             parallel;
+          3. The existing "qos dynamic affinity" and "qos smart grid"
+             features must not be used simultaneously.
+
 config CHECKPOINT_RESTORE
         bool "Checkpoint/restore support"
         depends on PROC_FS
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 417827f2c043..c6a919592735 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3488,7 +3488,8 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
         WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
         cpumask_copy(prefer_cpus_attach, cs->prefer_cpus);
-        set_prefer_cpus_ptr(task, prefer_cpus_attach);
+        if (!sched_paral_used() || !cpumask_empty(prefer_cpus_attach))
+                set_prefer_cpus_ptr(task, prefer_cpus_attach);
 #endif
 
         cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
@@ -4348,7 +4349,10 @@ static void cpuset_fork(struct task_struct *task)
 
                 set_cpus_allowed_ptr(task, current->cpus_ptr);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-                set_prefer_cpus_ptr(task, current->prefer_cpus);
+                rcu_read_lock();
+                if (!sched_paral_used() || !cpumask_empty(task_cs(current)->prefer_cpus))
+                        set_prefer_cpus_ptr(task, current->prefer_cpus);
+                rcu_read_unlock();
 #endif
                 task->mems_allowed = current->mems_allowed;
                 return;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f042d6101932..99f46f6ea198 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13855,6 +13855,19 @@ static int __init perf_event_sysfs_init(void)
 }
 device_initcall(perf_event_sysfs_init);
 
+int perf_pmu_type_of_name(const char *name)
+{
+        unsigned int i;
+        struct pmu *pmu;
+
+        idr_for_each_entry(&pmu_idr, pmu, i) {
+                if (!strcmp(pmu->name, name))
+                        return pmu->type;
+        }
+
+        return -1;
+}
+
 #ifdef CONFIG_CGROUP_PERF
 static struct cgroup_subsys_state *
 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/fork.c b/kernel/fork.c
index 96c6a9e446ac..8b2ff47de685 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -631,8 +631,7 @@ void free_task(struct task_struct *tsk)
         free_kthread_struct(tsk);
         bpf_task_storage_free(tsk);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-        if (dynamic_affinity_enabled())
-                sched_prefer_cpus_free(tsk);
+        sched_prefer_cpus_free(tsk);
 #endif
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
         if (smart_grid_enabled())
@@ -2451,11 +2450,9 @@ __latent_entropy struct task_struct *copy_process(
 #endif
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-        if (dynamic_affinity_enabled()) {
-                retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
-                if (retval)
-                        goto bad_fork_free;
-        }
+        retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
+        if (retval)
+                goto bad_fork_free;
 #endif
 
         lockdep_assert_irqs_enabled();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7a9e6896c699..793019869da9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -7,6 +7,10 @@
  * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
  */
 
+#ifdef CONFIG_SCHED_PARAL
+#include <asm/prefer_numa.h>
+#endif
+
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -95,6 +99,39 @@ static void sched_feat_disable(int i) { };
 static void sched_feat_enable(int i) { };
 #endif /* CONFIG_JUMP_LABEL */
 
+#ifdef CONFIG_SCHED_PARAL
+static void sched_feat_disable_paral(char *cmp)
+{
+        struct task_struct *tsk, *t;
+
+        if (strncmp(cmp, "PARAL", 5) == 0) {
+                read_lock(&tasklist_lock);
+                for_each_process(tsk) {
+                        if (tsk->flags & PF_KTHREAD || is_global_init(tsk))
+                                continue;
+
+                        for_each_thread(tsk, t)
+                                cpumask_clear(t->prefer_cpus);
+                }
+                read_unlock(&tasklist_lock);
+        }
+}
+
+static bool sched_feat_enable_paral(char *cmp)
+{
+        if (strncmp(cmp, "PARAL", 5) != 0)
+                return true;
+
+        if (probe_pmu_numa_event() != 0)
+                return false;
+
+        return true;
+}
+#else
+static void sched_feat_disable_paral(char *cmp) {};
+static bool sched_feat_enable_paral(char *cmp) { return true; };
+#endif /* CONFIG_SCHED_PARAL */
+
 static int sched_feat_set(char *cmp)
 {
         int i;
@@ -111,8 +148,12 @@ static int sched_feat_set(char *cmp)
 
         if (neg) {
                 sysctl_sched_features &= ~(1UL << i);
+                sched_feat_disable_paral(cmp);
                 sched_feat_disable(i);
         } else {
+                if (!sched_feat_enable_paral(cmp))
+                        return -EPERM;
+
                 sysctl_sched_features |= (1UL << i);
                 sched_feat_enable(i);
         }
@@ -1045,7 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
         P_SCHEDSTAT(nr_wakeups_passive);
         P_SCHEDSTAT(nr_wakeups_idle);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-        if (dynamic_affinity_enabled()) {
+        if (dynamic_affinity_enabled() || sched_paral_used()) {
                 P_SCHEDSTAT(nr_wakeups_preferred_cpus);
                 P_SCHEDSTAT(nr_wakeups_force_preferred_cpus);
         }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 71661d6c5b54..8a32d0ac4a8b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -75,6 +75,10 @@
 #endif
 #include <linux/sched/grid_qos.h>
 
+#ifdef CONFIG_SCHED_PARAL
+#include <asm/prefer_numa.h>
+#endif
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -9057,6 +9061,12 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 }
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+#ifdef CONFIG_SCHED_PARAL
+bool sched_paral_used(void)
+{
+        return sched_feat(PARAL);
+}
+#endif
 
 DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_switch);
 
@@ -9084,16 +9094,15 @@ __setup("dynamic_affinity=", dynamic_affinity_switch_setup);
 
 static inline bool prefer_cpus_valid(struct task_struct *p)
 {
-        struct cpumask *prefer_cpus;
+        struct cpumask *prefer_cpus = task_prefer_cpus(p);
 
-        if (!dynamic_affinity_enabled())
-                return false;
-
-        prefer_cpus = task_prefer_cpus(p);
+        if (dynamic_affinity_enabled() || sched_paral_used()) {
+                return !cpumask_empty(prefer_cpus) &&
+                       !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
+                       cpumask_subset(prefer_cpus, p->cpus_ptr);
+        }
 
-        return !cpumask_empty(prefer_cpus) &&
-               !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
-               cpumask_subset(prefer_cpus, p->cpus_ptr);
+        return false;
 }
 
 static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
@@ -9193,6 +9202,14 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
         }
         rcu_read_unlock();
 
+        /* In extreme cases, it may cause uneven system load. */
+        if (sched_paral_used() && sysctl_sched_util_low_pct == 100 && nr_cpus_valid > 0) {
+                p->select_cpus = p->prefer_cpus;
+                if (sd_flag & SD_BALANCE_WAKE)
+                        schedstat_inc(p->stats.nr_wakeups_preferred_cpus);
+                return;
+        }
+
         /*
          * Follow cases should select cpus_ptr, checking by condition of
          * tg_capacity > nr_cpus_valid:
@@ -14679,6 +14696,12 @@ static void task_fork_fair(struct task_struct *p)
         if (curr)
                 update_curr(cfs_rq);
         place_entity(cfs_rq, se, ENQUEUE_INITIAL);
+
+#ifdef CONFIG_SCHED_PARAL
+        if (sched_paral_used())
+                set_task_paral_node(p);
+#endif
+
         rq_unlock(rq, &rf);
 }
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ea7ba74810e3..67939d04542f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,6 +61,10 @@ SCHED_FEAT(SIS_UTIL, true)
 SCHED_FEAT(STEAL, false)
 #endif
 
+#ifdef CONFIG_SCHED_PARAL
+SCHED_FEAT(PARAL, false)
+#endif
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
  * in a single rq->lock section. Default disabled because the
--
2.25.1
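Usage sketch (not part of the patch itself): with CONFIG_SCHED_PARAL=y and debugfs mounted at
/sys/kernel/debug, the feature is toggled through the standard scheduler features file. PARAL
defaults to off, and enabling it is rejected with -EPERM if the "hisi_sccl3_hha0" PMU cannot be
probed via probe_pmu_numa_event():

  echo PARAL > /sys/kernel/debug/sched/features      # enable; newly forked processes get a preferred NUMA node
  echo NO_PARAL > /sys/kernel/debug/sched/features   # disable; prefer_cpus of existing user tasks is cleared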

FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/17284
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/OGE...