[PATCH OLK-5.10 V1] sched: Support NUMA parallel scheduling for multiple processes

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICBBNL

--------------------------------

For architectures with multiple NUMA node levels and large distances
between nodes, a better approach is to support processes running in
parallel on each NUMA node. The usage is restricted to the following
scenarios:
1. No CPU binding for user-space processes;
2. It is applicable to distributed applications, such as business
   architectures with one master and multiple slaves running in
   parallel;
3. The existing "qos dynamic affinity" and "qos smart grid" features
   must not be used simultaneously.

Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 arch/arm64/Kconfig                     |  1 +
 arch/arm64/configs/openeuler_defconfig |  1 +
 arch/arm64/mm/numa.c                   | 74 ++++++++++++++++++++++++++
 include/linux/perf_event.h             |  2 +
 include/linux/sched.h                  |  3 ++
 init/Kconfig                           | 22 ++++++++
 kernel/cgroup/cpuset.c                 |  8 ++-
 kernel/events/core.c                   | 13 +++++
 kernel/sched/debug.c                   | 37 +++++++++++++
 kernel/sched/fair.c                    | 26 ++++++++-
 kernel/sched/features.h                |  4 ++
 11 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 93ced97f8c6c..76f07a283d4e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -79,6 +79,7 @@ config ARM64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_SCHED_KEEP_ON_CORE
+	select ARCH_SUPPORTS_SCHED_PARAL
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index fb9f92d11bde..4dd4994d6fbb 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -190,6 +190,7 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_SCHED_STEAL=y
 CONFIG_SCHED_KEEP_ON_CORE=y
+CONFIG_SCHED_PARAL=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 # CONFIG_SYSFS_DEPRECATED is not set
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 99a746e14f2b..6340513bab65 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -777,3 +777,77 @@ void __init arm64_numa_init(void)
 
 	numa_init(dummy_numa_init);
 }
+
+#ifdef CONFIG_SCHED_PARAL
+#include <linux/perf_event.h>
+
+static atomic_t paral_nid_last = ATOMIC_INIT(-1);
+
+int probe_pmu_numa_event(void)
+{
+	struct perf_event *event;
+	struct perf_event_attr attr = {};
+	int type = perf_pmu_type_of_name("hisi_sccl3_hha0");
+
+	if (type == -1)
+		return -EINVAL;
+
+	attr.type = type;
+	attr.config = 0x02;
+	attr.size = sizeof(struct perf_event_attr);
+	attr.pinned = 1;
+	attr.disabled = 1;
+	attr.sample_period = 0;
+
+	event = perf_event_create_kernel_counter(&attr, smp_processor_id(),
+						 NULL, NULL, NULL);
+	if (IS_ERR(event))
+		return PTR_ERR(event);
+
+	perf_event_release_kernel(event);
+
+	return 0;
+}
+
+static inline int update_sched_paral_nid(void)
+{
+	int onid, nnid;
+
+	do {
+		onid = atomic_read(&paral_nid_last);
+		nnid = (onid >= INT_MAX) ? 0 : (onid + 1);
+	} while (atomic_cmpxchg(&paral_nid_last, onid, nnid) != onid);
+
+	return nnid;
+}
+
+void set_task_paral_node(struct task_struct *p)
+{
+	int nid;
+	int i = 0;
+	const cpumask_t *cpus_mask;
+
+	if (is_global_init(current))
+		return;
+
+	if (p->flags & PF_KTHREAD || p->tgid != p->pid)
+		return;
+
+	while (i < nr_node_ids) {
+		nid = update_sched_paral_nid() % nr_node_ids;
+		cpus_mask = cpumask_of_node(nid);
+
+		if (cpumask_empty(cpus_mask) ||
+		    !cpumask_subset(cpus_mask, p->cpus_ptr)) {
+			i++;
+			continue;
+		}
+
+		cpumask_copy(p->prefer_cpus, cpus_mask);
+		break;
+	}
+}
+#else
+void set_task_paral_node(struct task_struct *p) {}
+int probe_pmu_numa_event(void) { return -1; }
+#endif
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2544bfdd948b..7814ff2e45c7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1485,6 +1485,7 @@ extern void perf_event_task_tick(void);
 extern int perf_event_account_interrupt(struct perf_event *event);
 extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
+extern int perf_pmu_type_of_name(const char *name);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1577,6 +1578,7 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
 {
 	return 0;
 }
+static inline int perf_pmu_type_of_name(const char *name) { return -1; }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3170b7f81fa..181230773350 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2309,6 +2309,9 @@ int set_prefer_cpus_ptr(struct task_struct *p,
 int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask);
 void sched_prefer_cpus_free(struct task_struct *p);
 void dynamic_affinity_enable(void);
+bool sched_paral_used(void);
+void set_task_paral_node(struct task_struct *p);
+int probe_pmu_numa_event(void);
 #endif
 
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
diff --git a/init/Kconfig b/init/Kconfig
index 3a6a14e66acd..5f88cce193e8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1389,6 +1389,28 @@ config SCHED_KEEP_ON_CORE
 	  otherwise the task will not be migrated and the cpu0 will
 	  still be used.
 
+#
+# For architectures that want to enable the support for SCHED_PARAL
+#
+config ARCH_SUPPORTS_SCHED_PARAL
+	bool
+
+config SCHED_PARAL
+	bool "Parallelly schedule processes on different NUMA nodes"
+	depends on ARCH_SUPPORTS_SCHED_PARAL
+	depends on QOS_SCHED_DYNAMIC_AFFINITY
+	default n
+	help
+	  By enabling this feature, processes can be scheduled in parallel
+	  on various NUMA nodes to better utilize the cache in NUMA node.
+	  The usage is restricted to the following scenarios:
+	  1. No CPU binding is performed for user-space processes;
+	  2. It is applicable to distributed applications, such as business
+	     architectures with one master and multiple slaves running in
+	     parallel;
+	  3. The existing "qos dynamic affinity" and "qos smart grid"
+	     features must not be used simultaneously.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	select PROC_CHILDREN
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 7ecff06d2026..1f6ed08af9f5 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2423,7 +2423,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		 */
 		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-		set_prefer_cpus_ptr(task, prefer_cpus_attach);
+		if (!sched_paral_used() || !cpumask_empty(prefer_cpus_attach))
+			set_prefer_cpus_ptr(task, prefer_cpus_attach);
 #endif
 
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
@@ -3131,7 +3132,10 @@ static void cpuset_fork(struct task_struct *task)
 
 	set_cpus_allowed_ptr(task, current->cpus_ptr);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	set_prefer_cpus_ptr(task, current->prefer_cpus);
+	rcu_read_lock();
+	if (!sched_paral_used() || !cpumask_empty(task_cs(current)->prefer_cpus))
+		set_prefer_cpus_ptr(task, current->prefer_cpus);
+	rcu_read_unlock();
 #endif
 	task->mems_allowed = current->mems_allowed;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b56b572f1bd0..e0c193083aa0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13167,6 +13167,19 @@ static int __init perf_event_sysfs_init(void)
 }
 device_initcall(perf_event_sysfs_init);
 
+int perf_pmu_type_of_name(const char *name)
+{
+	unsigned int i;
+	struct pmu *pmu;
+
+	idr_for_each_entry(&pmu_idr, pmu, i) {
+		if (!strcmp(pmu->name, name))
+			return pmu->type;
+	}
+
+	return -1;
+}
+
 #ifdef CONFIG_CGROUP_PERF
 static struct cgroup_subsys_state *
 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4275398bc713..d6b53d54dcf5 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -96,6 +96,39 @@ static void sched_feat_disable(int i) { };
 static void sched_feat_enable(int i) { };
 #endif /* CONFIG_JUMP_LABEL */
 
+#ifdef CONFIG_SCHED_PARAL
+static void sched_feat_disable_paral(char *cmp)
+{
+	struct task_struct *tsk, *t;
+
+	if (strncmp(cmp, "PARAL", 5) == 0) {
+		read_lock(&tasklist_lock);
+		for_each_process(tsk) {
+			if (tsk->flags & PF_KTHREAD || is_global_init(tsk))
+				continue;
+
+			for_each_thread(tsk, t)
+				cpumask_clear(t->prefer_cpus);
+		}
+		read_unlock(&tasklist_lock);
+	}
+}
+
+static bool sched_feat_enable_paral(char *cmp)
+{
+	if (strncmp(cmp, "PARAL", 5) != 0)
+		return true;
+
+	if (probe_pmu_numa_event() != 0)
+		return false;
+
+	return true;
+}
+#else
+static void sched_feat_disable_paral(char *cmp) {};
+static bool sched_feat_enable_paral(char *cmp) { return true; };
+#endif /* CONFIG_SCHED_PARAL */
+
 static int sched_feat_set(char *cmp)
 {
 	int i;
@@ -112,8 +145,12 @@ static int sched_feat_set(char *cmp)
 
 	if (neg) {
 		sysctl_sched_features &= ~(1UL << i);
+		sched_feat_disable_paral(cmp);
 		sched_feat_disable(i);
 	} else {
+		if (!sched_feat_enable_paral(cmp))
+			return -EPERM;
+
 		sysctl_sched_features |= (1UL << i);
 		sched_feat_enable(i);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 437572c568ee..396c2b87c012 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8063,6 +8063,16 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 }
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+bool sched_paral_used(void)
+{
+#ifdef CONFIG_SCHED_PARAL
+	if (sched_feat(PARAL))
+		return true;
+#endif
+
+	return false;
+}
+
 static DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_used);
 
 static __always_inline bool dynamic_affinity_used(void)
@@ -8168,6 +8178,14 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
 	}
 	rcu_read_unlock();
 
+	/* In extreme cases, it may cause uneven system load. */
+	if (sched_paral_used() && sysctl_sched_util_low_pct == 100 && nr_cpus_valid > 0) {
+		p->select_cpus = p->prefer_cpus;
+		if (sd_flag & SD_BALANCE_WAKE)
+			schedstat_inc(p->se.statistics.nr_wakeups_preferred_cpus);
+		return;
+	}
+
 	/*
 	 * Follow cases should select cpus_ptr, checking by condition of
 	 * tg_capacity > nr_cpus_valid:
@@ -8225,7 +8243,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 #endif
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
-	if (dynamic_affinity_used() || smart_grid_used())
+	if (dynamic_affinity_used() || smart_grid_used() || sched_paral_used())
 		set_task_select_cpus(p, &idlest_cpu, sd_flag);
 #endif
 
@@ -9867,7 +9885,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	p->select_cpus = p->cpus_ptr;
-	if (dynamic_affinity_used() || smart_grid_used())
+	if (dynamic_affinity_used() || smart_grid_used() || sched_paral_used())
 		set_task_select_cpus(p, NULL, 0);
 	if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) {
 #else
@@ -13549,6 +13567,10 @@ static void task_fork_fair(struct task_struct *p)
 	}
 
 	se->vruntime -= cfs_rq->min_vruntime;
+
+	if (sched_paral_used())
+		set_task_paral_node(p);
+
 	rq_unlock(rq, &rf);
 }
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index fb885b20ba34..1fd89af55681 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -74,6 +74,10 @@ SCHED_FEAT(STEAL, false)
 SCHED_FEAT(KEEP_ON_CORE, false)
 #endif
 
+#ifdef CONFIG_SCHED_PARAL
+SCHED_FEAT(PARAL, false)
+#endif
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
 * in a single rq->lock section. Default disabled because the
-- 
2.25.1
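
Since the patch wires PARAL into sched_feat_set(), the feature is
toggled through the usual sched_features debugfs file once the kernel
is built with CONFIG_SCHED_PARAL=y. A minimal usage sketch, assuming
debugfs is mounted at /sys/kernel/debug and the machine exposes the
hisi_sccl3_hha0 uncore PMU that probe_pmu_numa_event() looks for:

    # Enable NUMA-parallel scheduling; the write fails with EPERM if
    # probe_pmu_numa_event() cannot find the PMU.
    echo PARAL > /sys/kernel/debug/sched_features

    # Disable it again; sched_feat_disable_paral() clears prefer_cpus
    # of all user-space threads.
    echo NO_PARAL > /sys/kernel/debug/sched_features

While PARAL is set, task_fork_fair() calls set_task_paral_node() for
each new thread-group leader, which copies the CPUs of one NUMA node,
chosen round-robin via update_sched_paral_nid(), into the task's
prefer_cpus mask, provided the task is not a kernel thread and the
node's CPUs are a subset of p->cpus_ptr.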

Feedback: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/16529
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/PJO...