[PATCH OLK-6.6 V1] sched: Support NUMA parallel scheduling for multiple processes

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICBBNL

--------------------------------

For architectures with multiple NUMA node levels and large distances
between nodes, a better approach is to support processes running in
parallel on each NUMA node.

The usage is restricted to the following scenarios:
1. No CPU binding for user-space processes;
2. It is applicable to distributed applications, such as business
   architectures with one master and multiple slaves running in parallel;
3. The existing "qos dynamic affinity" and "qos smart grid" features
   must not be used simultaneously.

Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 arch/arm64/Kconfig                     |  1 +
 arch/arm64/configs/openeuler_defconfig |  1 +
 arch/arm64/include/asm/prefer_numa.h   | 13 +++++
 arch/arm64/kernel/Makefile             |  1 +
 arch/arm64/kernel/prefer_numa.c        | 68 ++++++++++++++++++++++++++
 fs/proc/array.c                        |  3 --
 include/linux/perf_event.h             |  2 +
 include/linux/sched.h                  |  6 +++
 init/Kconfig                           | 22 +++++++++
 kernel/cgroup/cpuset.c                 |  8 ++-
 kernel/events/core.c                   | 13 +++++
 kernel/fork.c                          | 11 ++---
 kernel/sched/debug.c                   | 43 +++++++++++++++-
 kernel/sched/fair.c                    | 39 ++++++++++++---
 kernel/sched/features.h                |  4 ++
 15 files changed, 214 insertions(+), 21 deletions(-)
 create mode 100644 arch/arm64/include/asm/prefer_numa.h
 create mode 100644 arch/arm64/kernel/prefer_numa.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5422d1502fd6..b1f550c8c82a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -105,6 +105,7 @@ config ARM64
         select ARCH_SUPPORTS_ATOMIC_RMW
         select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
         select ARCH_SUPPORTS_NUMA_BALANCING
+        select ARCH_SUPPORTS_SCHED_PARAL
         select ARCH_SUPPORTS_PAGE_TABLE_CHECK
         select ARCH_SUPPORTS_PER_VMA_LOCK
         select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 3cfff0701479..3d352fb1ae57 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -209,6 +209,7 @@ CONFIG_USER_NS=y
 CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_SCHED_STEAL=y
+CONFIG_SCHED_PARAL=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_RELAY=y
diff --git a/arch/arm64/include/asm/prefer_numa.h b/arch/arm64/include/asm/prefer_numa.h
new file mode 100644
index 000000000000..6c8e2b2142b9
--- /dev/null
+++ b/arch/arm64/include/asm/prefer_numa.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_PREFER_NUMA_H
+#define __ASM_PREFER_NUMA_H
+
+#include <linux/sched.h>
+
+#define PROBE_NUMA_PMU_NAME "hisi_sccl3_hha0"
+#define PROBE_NUMA_PMU_EVENT 0x02
+
+void set_task_paral_node(struct task_struct *p);
+int probe_pmu_numa_event(void);
+
+#endif /* __ASM_PREFER_NUMA_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 3d404a2cc961..b936be9d8baa 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_IPI_AS_NMI)                += ipi_nmi.o
 obj-$(CONFIG_HISI_VIRTCCA_GUEST)        += virtcca_cvm_guest.o virtcca_cvm_tsi.o
 obj-$(CONFIG_HISI_VIRTCCA_HOST)         += virtcca_cvm_host.o
 CFLAGS_patch-scs.o                      += -mbranch-protection=none
+obj-$(CONFIG_SCHED_PARAL)               += prefer_numa.o
 
 # Force dependency (vdso*-wrap.S includes vdso.so through incbin)
 $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so
diff --git a/arch/arm64/kernel/prefer_numa.c b/arch/arm64/kernel/prefer_numa.c
new file mode 100644
index 000000000000..e6f3f43fb97a
--- /dev/null
+++ b/arch/arm64/kernel/prefer_numa.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * choose a prefer numa node
+ *
+ * Copyright (C) 2025 Huawei Limited.
+ */
+#include <linux/perf_event.h>
+#include <asm/prefer_numa.h>
+
+static atomic_t paral_nid_last = ATOMIC_INIT(-1);
+
+int probe_pmu_numa_event(void)
+{
+        struct perf_event *event;
+        struct perf_event_attr attr = {};
+        int type = perf_pmu_type_of_name(PROBE_NUMA_PMU_NAME);
+
+        if (type == -1)
+                return -EINVAL;
+
+        attr.type = type;
+        attr.config = PROBE_NUMA_PMU_EVENT;
+        attr.size = sizeof(struct perf_event_attr);
+        attr.pinned = 1;
+        attr.disabled = 1;
+        attr.sample_period = 0;
+
+        event = perf_event_create_kernel_counter(&attr, smp_processor_id(),
+                                                 NULL, NULL, NULL);
+        if (IS_ERR(event))
+                return PTR_ERR(event);
+
+        perf_event_release_kernel(event);
+
+        return 0;
+}
+
+static inline unsigned int update_sched_paral_nid(void)
+{
+        return (unsigned int)atomic_inc_return(&paral_nid_last);
+}
+
+void set_task_paral_node(struct task_struct *p)
+{
+        int nid;
+        int i = 0;
+        const cpumask_t *cpus_mask;
+
+        if (is_global_init(current))
+                return;
+
+        if (p->flags & PF_KTHREAD || p->tgid != p->pid)
+                return;
+
+        while (i < nr_node_ids) {
+                nid = update_sched_paral_nid() % nr_node_ids;
+                cpus_mask = cpumask_of_node(nid);
+
+                if (cpumask_empty(cpus_mask) ||
+                    !cpumask_subset(cpus_mask, p->cpus_ptr)) {
+                        i++;
+                        continue;
+                }
+
+                cpumask_copy(p->prefer_cpus, cpus_mask);
+                break;
+        }
+}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index a933a878df3c..6a4b0a850dce 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -439,9 +439,6 @@ __weak void arch_proc_pid_thread_features(struct seq_file *m,
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 static void task_cpus_preferred(struct seq_file *m, struct task_struct *task)
 {
-        if (!dynamic_affinity_enabled())
-                return;
-
         seq_printf(m, "Cpus_preferred:\t%*pb\n",
                    cpumask_pr_args(task->prefer_cpus));
         seq_printf(m, "Cpus_preferred_list:\t%*pbl\n",
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 826fb16906fe..14ed13f4b408 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1778,6 +1778,7 @@ extern void perf_event_task_tick(void);
 extern int perf_event_account_interrupt(struct perf_event *event);
 extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
+extern int perf_pmu_type_of_name(const char *name);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1864,6 +1865,7 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
 {
         return 0;
 }
+static inline int perf_pmu_type_of_name(const char *name) { return -1; }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3979c34e9b83..ee10780715f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2627,6 +2627,12 @@ static inline bool dynamic_affinity_enabled(void)
 {
         return static_branch_unlikely(&__dynamic_affinity_switch);
 }
+
+#ifdef CONFIG_SCHED_PARAL
+bool sched_paral_used(void);
+#else
+static inline bool sched_paral_used(void) { return false; }
+#endif
 #endif
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
diff --git a/init/Kconfig b/init/Kconfig
index c8bd58347a87..925e8517a7e8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1484,6 +1484,28 @@ config SCHED_STEAL
 
           If unsure, say N here.
 
+#
+# For architectures that want to enable the support for SCHED_PARAL
+#
+config ARCH_SUPPORTS_SCHED_PARAL
+        bool
+
+config SCHED_PARAL
+        bool "Parallelly schedule processes on different NUMA nodes"
+        depends on ARCH_SUPPORTS_SCHED_PARAL
+        depends on QOS_SCHED_DYNAMIC_AFFINITY
+        default n
+        help
+          By enabling this feature, processes can be scheduled in parallel
+          on various NUMA nodes to better utilize the cache in NUMA node.
+          The usage is restricted to the following scenarios:
+          1. No CPU binding is performed for user-space processes;
+          2. It is applicable to distributed applications, such as business
+             architectures with one master and multiple slaves running in
+             parallel;
+          3. The existing "qos dynamic affinity" and "qos smart grid"
+             features must not be used simultaneously.
+
 config CHECKPOINT_RESTORE
         bool "Checkpoint/restore support"
         depends on PROC_FS
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 417827f2c043..c6a919592735 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3488,7 +3488,8 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
         WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
         cpumask_copy(prefer_cpus_attach, cs->prefer_cpus);
-        set_prefer_cpus_ptr(task, prefer_cpus_attach);
+        if (!sched_paral_used() || !cpumask_empty(prefer_cpus_attach))
+                set_prefer_cpus_ptr(task, prefer_cpus_attach);
 #endif
 
         cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
@@ -4348,7 +4349,10 @@ static void cpuset_fork(struct task_struct *task)
 
                 set_cpus_allowed_ptr(task, current->cpus_ptr);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-                set_prefer_cpus_ptr(task, current->prefer_cpus);
+                rcu_read_lock();
+                if (!sched_paral_used() || !cpumask_empty(task_cs(current)->prefer_cpus))
+                        set_prefer_cpus_ptr(task, current->prefer_cpus);
+                rcu_read_unlock();
 #endif
                 task->mems_allowed = current->mems_allowed;
                 return;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f042d6101932..99f46f6ea198 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13855,6 +13855,19 @@ static int __init perf_event_sysfs_init(void)
 }
 device_initcall(perf_event_sysfs_init);
 
+int perf_pmu_type_of_name(const char *name)
+{
+        unsigned int i;
+        struct pmu *pmu;
+
+        idr_for_each_entry(&pmu_idr, pmu, i) {
+                if (!strcmp(pmu->name, name))
+                        return pmu->type;
+        }
+
+        return -1;
+}
+
 #ifdef CONFIG_CGROUP_PERF
 static struct cgroup_subsys_state *
 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/fork.c b/kernel/fork.c
index 96c6a9e446ac..8b2ff47de685 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -631,8 +631,7 @@ void free_task(struct task_struct *tsk)
         free_kthread_struct(tsk);
         bpf_task_storage_free(tsk);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-        if (dynamic_affinity_enabled())
-                sched_prefer_cpus_free(tsk);
+        sched_prefer_cpus_free(tsk);
 #endif
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
         if (smart_grid_enabled())
@@ -2451,11 +2450,9 @@ __latent_entropy struct task_struct *copy_process(
 #endif
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-        if (dynamic_affinity_enabled()) {
-                retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
-                if (retval)
-                        goto bad_fork_free;
-        }
+        retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
+        if (retval)
+                goto bad_fork_free;
 #endif
 
         lockdep_assert_irqs_enabled();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7a9e6896c699..793019869da9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -7,6 +7,10 @@
  * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
  */
 
+#ifdef CONFIG_SCHED_PARAL
+#include <asm/prefer_numa.h>
+#endif
+
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -95,6 +99,39 @@ static void sched_feat_disable(int i) { };
 static void sched_feat_enable(int i) { };
 #endif /* CONFIG_JUMP_LABEL */
 
+#ifdef CONFIG_SCHED_PARAL
+static void sched_feat_disable_paral(char *cmp)
+{
+        struct task_struct *tsk, *t;
+
+        if (strncmp(cmp, "PARAL", 5) == 0) {
+                read_lock(&tasklist_lock);
+                for_each_process(tsk) {
+                        if (tsk->flags & PF_KTHREAD || is_global_init(tsk))
+                                continue;
+
+                        for_each_thread(tsk, t)
+                                cpumask_clear(t->prefer_cpus);
+                }
+                read_unlock(&tasklist_lock);
+        }
+}
+
+static bool sched_feat_enable_paral(char *cmp)
+{
+        if (strncmp(cmp, "PARAL", 5) != 0)
+                return true;
+
+        if (probe_pmu_numa_event() != 0)
+                return false;
+
+        return true;
+}
+#else
+static void sched_feat_disable_paral(char *cmp) {};
+static bool sched_feat_enable_paral(char *cmp) { return true; };
+#endif /* CONFIG_SCHED_PARAL */
+
 static int sched_feat_set(char *cmp)
 {
         int i;
@@ -111,8 +148,12 @@ static int sched_feat_set(char *cmp)
 
         if (neg) {
                 sysctl_sched_features &= ~(1UL << i);
+                sched_feat_disable_paral(cmp);
                 sched_feat_disable(i);
         } else {
+                if (!sched_feat_enable_paral(cmp))
+                        return -EPERM;
+
                 sysctl_sched_features |= (1UL << i);
                 sched_feat_enable(i);
         }
@@ -1045,7 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
         P_SCHEDSTAT(nr_wakeups_passive);
         P_SCHEDSTAT(nr_wakeups_idle);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-        if (dynamic_affinity_enabled()) {
+        if (dynamic_affinity_enabled() || sched_paral_used()) {
                 P_SCHEDSTAT(nr_wakeups_preferred_cpus);
                 P_SCHEDSTAT(nr_wakeups_force_preferred_cpus);
         }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 71661d6c5b54..8a32d0ac4a8b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -75,6 +75,10 @@
 #endif
 #include <linux/sched/grid_qos.h>
 
+#ifdef CONFIG_SCHED_PARAL
+#include <asm/prefer_numa.h>
+#endif
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -9057,6 +9061,12 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 }
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+#ifdef CONFIG_SCHED_PARAL
+bool sched_paral_used(void)
+{
+        return sched_feat(PARAL);
+}
+#endif
 
 DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_switch);
 
@@ -9084,16 +9094,15 @@ __setup("dynamic_affinity=", dynamic_affinity_switch_setup);
 
 static inline bool prefer_cpus_valid(struct task_struct *p)
 {
-        struct cpumask *prefer_cpus;
+        struct cpumask *prefer_cpus = task_prefer_cpus(p);
 
-        if (!dynamic_affinity_enabled())
-                return false;
-
-        prefer_cpus = task_prefer_cpus(p);
+        if (dynamic_affinity_enabled() || sched_paral_used()) {
+                return !cpumask_empty(prefer_cpus) &&
+                       !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
+                       cpumask_subset(prefer_cpus, p->cpus_ptr);
+        }
 
-        return !cpumask_empty(prefer_cpus) &&
-               !cpumask_equal(prefer_cpus, p->cpus_ptr) &&
-               cpumask_subset(prefer_cpus, p->cpus_ptr);
+        return false;
 }
 
 static inline unsigned long taskgroup_cpu_util(struct task_group *tg,
@@ -9193,6 +9202,14 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
         }
         rcu_read_unlock();
 
+        /* In extreme cases, it may cause uneven system load. */
+        if (sched_paral_used() && sysctl_sched_util_low_pct == 100 && nr_cpus_valid > 0) {
+                p->select_cpus = p->prefer_cpus;
+                if (sd_flag & SD_BALANCE_WAKE)
+                        schedstat_inc(p->stats.nr_wakeups_preferred_cpus);
+                return;
+        }
+
         /*
          * Follow cases should select cpus_ptr, checking by condition of
          * tg_capacity > nr_cpus_valid:
@@ -14679,6 +14696,12 @@ static void task_fork_fair(struct task_struct *p)
         if (curr)
                 update_curr(cfs_rq);
         place_entity(cfs_rq, se, ENQUEUE_INITIAL);
+
+#ifdef CONFIG_SCHED_PARAL
+        if (sched_paral_used())
+                set_task_paral_node(p);
+#endif
+
         rq_unlock(rq, &rf);
 }
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ea7ba74810e3..67939d04542f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,6 +61,10 @@ SCHED_FEAT(SIS_UTIL, true)
 SCHED_FEAT(STEAL, false)
 #endif
 
+#ifdef CONFIG_SCHED_PARAL
+SCHED_FEAT(PARAL, false)
+#endif
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
  * in a single rq->lock section. Default disabled because the
--
2.25.1
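Usage sketch (not part of the patch itself): with CONFIG_SCHED_PARAL=y and debugfs mounted at
/sys/kernel/debug, the feature is toggled through the standard scheduler features file. PARAL
defaults to off, and enabling it is rejected with -EPERM if the "hisi_sccl3_hha0" PMU cannot be
probed via probe_pmu_numa_event():

  echo PARAL > /sys/kernel/debug/sched/features      # enable; newly forked processes get a preferred NUMA node
  echo NO_PARAL > /sys/kernel/debug/sched/features   # disable; prefer_cpus of existing user tasks is cleared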

FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/17284
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/OGE...