This series introduces dynamic affinity for the CFS scheduler. A new cpuset interface lets a taskgroup declare preferred CPUs; while the utilization of those CPUs is low, tasks are kept on them to improve locality and reduce interference between cgroups, and otherwise they may spill over into the rest of cpus_allowed. The series also adjusts the wakeup and load-balance CPU ranges accordingly, adds per-task statistics, and enables CONFIG_QOS_SCHED_DYNAMIC_AFFINITY in the defconfigs.
Hui Tang (6):
  sched: Introduce dynamic affinity for cfs scheduler
  cpuset: Introduce new interface for scheduler dynamic affinity
  sched: Adjust wakeup cpu range according to CPU util dynamically
  sched: Adjust cpu range in load balance dynamically
  sched: Add statistics for scheduler dynamic affinity
  config: enable CONFIG_QOS_SCHED_DYNAMIC_AFFINITY by default
 arch/arm64/configs/hulk_defconfig      |   1 +
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/hulk_defconfig        |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 include/linux/sched.h                  |  36 ++++++
 include/linux/sched/sysctl.h           |   4 +
 init/Kconfig                           |  10 ++
 init/init_task.c                       |   3 +
 kernel/cgroup/cpuset.c                 | 151 ++++++++++++++++++++++-
 kernel/fork.c                          |  13 ++
 kernel/sched/core.c                    | 103 ++++++++++++++++
 kernel/sched/debug.c                   |   7 ++
 kernel/sched/fair.c                    | 159 +++++++++++++++++++++++++
 kernel/sysctl.c                        |  11 ++
 14 files changed, 500 insertions(+), 1 deletion(-)
From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA
--------------------------------
Dynamic affinity sets preferred CPUs for a task. When the utilization of the taskgroup's preferred CPUs is low, the task runs only on those CPUs, which improves CPU resource locality and reduces interference between task cgroups; otherwise the task may burst out of the preferred CPUs and use any other CPU within its cpus_allowed.
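As a rough illustration of the policy (a simplified sketch, not the in-kernel code: the helper and its plain arrays below stand in for the per-CPU util_avg and capacity_of() values that the later patches read from the task group, and 85 is the default of the sysctl_sched_util_low_pct knob introduced later in this series):

/*
 * Simplified sketch of the dynamic-affinity decision: keep a task on
 * its preferred CPUs only while they are lightly utilized.
 */
static int stay_on_preferred_cpus(const unsigned long *util,
				  const unsigned long *capacity,
				  int nr_preferred, int low_pct)
{
	unsigned long util_sum = 0, cap_sum = 0;
	int i;

	for (i = 0; i < nr_preferred; i++) {
		util_sum += util[i];	/* taskgroup util_avg on CPU i */
		cap_sum += capacity[i];	/* capacity of CPU i */
	}

	/*
	 * Below the low-utilization threshold: stay on the preferred
	 * CPUs; above it, burst into the full cpus_allowed.
	 */
	return util_sum * 100 <= cap_sum * low_pct;
}

For example, stay_on_preferred_cpus(util, cap, 4, 85) mirrors the default behaviour for a group with four preferred CPUs. The actual set_task_select_cpus() added in patch 3 additionally short-circuits as soon as an idle preferred CPU is found and only applies the threshold when the summed capacity exceeds the number of preferred CPUs.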
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 init/Kconfig | 10 ++++++++++
 1 file changed, 10 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig
index ac1c864524ac..2c79f2e51fc4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -823,6 +823,16 @@ config RT_GROUP_SCHED
endif #CGROUP_SCHED
+config QOS_SCHED_DYNAMIC_AFFINITY
+	bool "qos dynamic affinity"
+	depends on CPUSETS
+	default n
+	help
+	  This feature lets you allocate preferred CPUs to a taskgroup.
+	  If enabled, tasks in the taskgroup use only the preferred CPUs
+	  while the taskgroup's CPU utilization is below the configured
+	  threshold, and fall back to the full cpus_allowed otherwise.
+
 config CGROUP_PIDS
 	bool "PIDs controller"
 	help
From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA
--------------------------------
Add a 'preferred_cpus' file and the related per-task 'prefer_cpus' interface to the cpuset cgroup.
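For context, a minimal userspace sketch of driving the new file (assumptions, not part of this patch: a v1 cpuset hierarchy mounted at /sys/fs/cgroup/cpuset, an existing child group named "mygrp", and the usual "cpuset." prefix on the file name):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Path and group name are illustrative assumptions. */
	const char *path = "/sys/fs/cgroup/cpuset/mygrp/cpuset.preferred_cpus";
	const char *cpulist = "0-3\n";	/* prefer CPUs 0-3 for tasks in mygrp */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cpulist, strlen(cpulist)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Per update_prefer_cpumask() below, writing an empty string clears the preferred mask (disabling dynamic affinity for the group's tasks), and the written list must be a subset of the group's cpus_allowed, otherwise the write fails with -EINVAL.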
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 include/linux/sched.h  |  17 +++++
 init/init_task.c       |   3 +
 kernel/cgroup/cpuset.c | 151 ++++++++++++++++++++++++++++++++++++++++-
 kernel/fork.c          |  13 ++++
 kernel/sched/core.c    |  95 ++++++++++++++++++++++++++
 5 files changed, 278 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index ca020a991b33..718ec0289d83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1247,7 +1247,16 @@ struct task_struct { #else KABI_RESERVE(5) #endif + +#if !defined(__GENKSYMS__) +#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) + cpumask_t *prefer_cpus; +#else + KABI_RESERVE(6) +#endif +#else KABI_RESERVE(6) +#endif KABI_RESERVE(7) KABI_RESERVE(8)
@@ -1964,4 +1973,12 @@ static inline int sched_qos_cpu_overload(void) } #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int dynamic_affinity_enabled(void); +int set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask); +int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig); +void sched_prefer_cpus_free(struct task_struct *p); +#endif + #endif diff --git a/init/init_task.c b/init/init_task.c index 57ff82ab9811..b312a045f4b9 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -180,6 +180,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + .prefer_cpus = NULL, +#endif #ifdef CONFIG_PID_RESERVE .fork_pid_union = { .fork_pid = 0, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3544ee391350..55bfbc4cdb16 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -104,6 +104,9 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif
/* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; @@ -436,11 +439,22 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) goto free_cs; if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) goto free_cpus; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!alloc_cpumask_var(&trial->prefer_cpus, GFP_KERNEL)) + goto free_prefer_cpus; +#endif
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(trial->prefer_cpus, cs->prefer_cpus); +#endif return trial;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +free_prefer_cpus: + free_cpumask_var(trial->effective_cpus); +#endif free_cpus: free_cpumask_var(trial->cpus_allowed); free_cs: @@ -456,6 +470,9 @@ static void free_trial_cpuset(struct cpuset *trial) { free_cpumask_var(trial->effective_cpus); free_cpumask_var(trial->cpus_allowed); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(trial->prefer_cpus); +#endif kfree(trial); }
@@ -487,6 +504,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
rcu_read_lock();
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + ret = -EINVAL; + if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed)) + goto out; +#endif /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; cpuset_for_each_child(c, css, cur) @@ -551,6 +573,66 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) return ret; }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static cpumask_var_t prefer_cpus_attach; + +static void update_tasks_prefer_cpumask(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + set_prefer_cpus_ptr(task, cs->prefer_cpus); + css_task_iter_end(&it); +} + +/* + * update_prefer_cpumask - update the prefer_cpus mask of a cpuset and + * all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + + if (cs == &top_cpuset) + return -EACCES; + + /* + * An empty prefer_cpus is ok which mean that the cpuset tasks disable + * dynamic affinity feature. + * Since cpulist_parse() fails on an empty mask, we special case + * that parsing. + */ + if (!*buf) { + cpumask_clear(trialcs->prefer_cpus); + } else { + retval = cpulist_parse(buf, trialcs->prefer_cpus); + if (retval < 0) + return retval; + } + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus)) + return 0; + + if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed)) + return -EINVAL; + + update_tasks_prefer_cpumask(trialcs); + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus); + spin_unlock_irq(&callback_lock); + + return 0; +} +#endif + #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). @@ -1543,6 +1625,10 @@ static void cpuset_attach(struct cgroup_taskset *tset) else guarantee_online_cpus(cs, cpus_attach);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(prefer_cpus_attach, cs->prefer_cpus); +#endif + guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) { @@ -1551,6 +1637,9 @@ static void cpuset_attach(struct cgroup_taskset *tset) * fail. TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, prefer_cpus_attach); +#endif
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); @@ -1610,6 +1699,9 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + FILE_DYNAMIC_CPULIST, +#endif } cpuset_filetype_t;
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, @@ -1735,6 +1827,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + retval = update_prefer_cpumask(cs, trialcs, buf); + break; +#endif default: retval = -EINVAL; break; @@ -1778,6 +1875,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus)); + break; +#endif default: ret = -EINVAL; } @@ -1935,7 +2037,15 @@ static struct cftype files[] = { .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE_ENABLED, }, - +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .name = "preferred_cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_DYNAMIC_CPULIST, + }, +#endif { } /* terminate */ };
@@ -1959,17 +2069,28 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) goto free_cs; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) goto free_cpus; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!alloc_cpumask_var(&cs->prefer_cpus, GFP_KERNEL)) + goto free_effective_cpus; +#endif
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_clear(cs->prefer_cpus); +#endif fmeter_init(&cs->fmeter); cs->relax_domain_level = -1;
return &cs->css;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +free_effective_cpus: + free_cpumask_var(cs->effective_cpus); +#endif free_cpus: free_cpumask_var(cs->cpus_allowed); free_cs: @@ -2034,6 +2155,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(cs->prefer_cpus, parent->prefer_cpus); +#endif spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); @@ -2065,6 +2189,9 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(cs->prefer_cpus); +#endif free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); kfree(cs); @@ -2099,6 +2226,9 @@ static void cpuset_fork(struct task_struct *task) return;
set_cpus_allowed_ptr(task, ¤t->cpus_allowed); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, current->prefer_cpus); +#endif task->mems_allowed = current->mems_allowed; }
@@ -2129,11 +2259,17 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL)); +#endif
cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_clear(top_cpuset.prefer_cpus); +#endif
fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); @@ -2144,6 +2280,9 @@ int __init cpuset_init(void) return err;
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL)); +#endif
return 0; } @@ -2180,6 +2319,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t prefer_cpus; +#endif bool is_empty;
spin_lock_irq(&callback_lock); @@ -2198,6 +2340,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) { + cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed); + cpumask_copy(cs->prefer_cpus, &prefer_cpus); + update_tasks_prefer_cpumask(cs); + } +#endif is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed);
diff --git a/kernel/fork.c b/kernel/fork.c index 7608869f4f1e..4207d5e5958b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -459,6 +459,9 @@ void free_task(struct task_struct *tsk) arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + sched_prefer_cpus_free(tsk); +#endif free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -888,6 +891,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->seccomp.filter = NULL; #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + tsk->prefer_cpus = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -1862,6 +1869,12 @@ static __latent_entropy struct task_struct *copy_process( if (retval < 0) goto bad_fork_free;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + retval = sched_prefer_cpus_fork(p, current); + if (retval) + goto bad_fork_free; +#endif + /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 36d7422da0ac..835f7c6c00ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7191,6 +7191,101 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, return 0; }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig) +{ + p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!p->prefer_cpus) + return -ENOMEM; + + if (orig->prefer_cpus) + cpumask_copy(p->prefer_cpus, orig->prefer_cpus); + else + cpumask_clear(p->prefer_cpus); + + return 0; +} + +void sched_prefer_cpus_free(struct task_struct *p) +{ + kfree(p->prefer_cpus); +} + +static void do_set_prefer_cpus(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + } + if (running) + put_prev_task(rq, p); + + cpumask_copy(p->prefer_cpus, new_mask); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_curr_task(rq, p); +} + +/* + * Change a given task's prefer CPU affinity. Prioritize migrate the thread to + * prefer cpus according to preferred bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + struct rq_flags rf; + struct rq *rq; + int ret = 0; + + if (unlikely(!p->prefer_cpus)) + return -EINVAL; + + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + if (cpumask_equal(p->prefer_cpus, new_mask)) + goto out; + + if (!cpumask_subset(new_mask, &p->cpus_allowed)) { + ret = -EINVAL; + goto out; + } + + do_set_prefer_cpus(p, new_mask); +out: + task_rq_unlock(rq, p, &rf); + + return ret; +} + +int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class != &fair_sched_class) + return 0; + + return __set_prefer_cpus_ptr(p, new_mask, false); +} +#endif + #ifdef CONFIG_CFS_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) {
From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA
--------------------------------
Compare the taskgroup's 'util_avg' on the preferred CPUs with the capacity of those CPUs, and dynamically adjust the CPU range used for task wakeup accordingly.
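For example (illustrative numbers): with four preferred CPUs of capacity 1024 each, tg_capacity is 4096. At the default sysctl_sched_util_low_pct of 85 (exposed below as /proc/sys/kernel/sched_util_low_pct), the wakeup path keeps the task within the preferred CPUs as long as the taskgroup's summed util_avg on them satisfies util_avg_sum * 100 <= 4096 * 85, i.e. util_avg_sum <= 3481; above that, the full cpus_allowed mask is used.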
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 include/linux/sched.h        |   4 +-
 include/linux/sched/sysctl.h |   4 +
 kernel/sched/fair.c          | 144 +++++++++++++++++++++++++++++++++++
 kernel/sysctl.c              |  11 +++
 4 files changed, 162 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 718ec0289d83..8f27fa3e5622 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1251,13 +1251,15 @@ struct task_struct { #if !defined(__GENKSYMS__) #if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) cpumask_t *prefer_cpus; + const cpumask_t *select_cpus; #else KABI_RESERVE(6) + KABI_RESERVE(7) #endif #else KABI_RESERVE(6) -#endif KABI_RESERVE(7) +#endif KABI_RESERVE(8)
/* CPU-specific state of this task: */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index b277fbc807ec..04eb5b127867 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -32,6 +32,10 @@ extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +extern int sysctl_sched_util_low_pct; +#endif + enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d553a4c5120..407bceee1126 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1775,6 +1775,9 @@ static void task_numa_compare(struct task_numa_env *env, * can be used from IRQ context. */ local_irq_disable(); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + env->p->select_cpus = &env->p->cpus_allowed; +#endif env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, env->dst_cpu); local_irq_enable(); @@ -5955,8 +5958,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int i;
/* Skip over this group if it has no CPUs allowed */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_group_span(group), + p->select_cpus)) +#else if (!cpumask_intersects(sched_group_span(group), &p->cpus_allowed)) +#endif continue;
local_group = cpumask_test_cpu(this_cpu, @@ -6088,7 +6096,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + for_each_cpu_and(i, sched_group_span(group), p->select_cpus) { +#else for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { +#endif if (sched_idle_cpu(i)) return i;
@@ -6131,7 +6143,11 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus)) +#else if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) +#endif return prev_cpu;
/* @@ -6248,7 +6264,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int if (!test_idle_cores(target, false)) return -1;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); +#else cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); +#endif
for_each_cpu_wrap(core, cpus, target) { bool idle = true; @@ -6282,8 +6302,13 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_test_cpu(cpu, p->select_cpus) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) +#else if (!cpumask_test_cpu(cpu, &p->cpus_allowed) || !cpumask_test_cpu(cpu, sched_domain_span(sd))) +#endif continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6344,7 +6369,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); +#else cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); +#endif
for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) @@ -6383,7 +6412,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + cpumask_test_cpu(target, p->select_cpus)) { +#else if (available_idle_cpu(target) || sched_idle_cpu(target)) { +#endif SET_STAT(found_idle_cpu_easy); return target; } @@ -6391,8 +6425,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * If the previous CPU is cache affine and idle, don't be stupid: */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY if (prev != target && cpus_share_cache(prev, target) && + cpumask_test_cpu(prev, p->select_cpus) && (available_idle_cpu(prev) || sched_idle_cpu(prev))) { +#else + if (prev != target && cpus_share_cache(prev, target) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) { +#endif SET_STAT(found_idle_cpu_easy); return prev; } @@ -6403,7 +6443,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(p->recent_used_cpu, p->select_cpus)) { +#else cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { +#endif /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6605,7 +6649,85 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) sync_entity_load_avg(&p->se);
return min_cap * 1024 < task_util(p) * capacity_margin; + +} + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +/* + * Low utilization threshold for CPU + * + * (default: 85%), units: percentage of CPU utilization) + */ +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + return p->prefer_cpus && + !cpumask_empty(p->prefer_cpus) && + !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) && + cpumask_subset(p->prefer_cpus, &p->cpus_allowed); +} + +/* + * set_task_select_cpus: select the cpu range for task + * @p: the task whose available cpu range will to set + * @idlest_cpu: the cpu which is the idlest in prefer cpus + * + * If sum of 'util_avg' among 'preferred_cpus' lower than the percentage + * 'sysctl_sched_util_low_pct' of 'preferred_cpus' capacity, select + * 'preferred_cpus' range for task, otherwise select 'preferred_cpus' for task. + * + * The available cpu range set to p->select_cpus. Idlest cpu in preferred cpus + * set to @idlest_cpu, which is set to wakeup cpu when fast path wakeup cpu + * without p->select_cpus. + */ +static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, + int sd_flag) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + + p->select_cpus = &p->cpus_allowed; + if (!prefer_cpus_valid(p)) + return; + + rcu_read_lock(); + tg = task_group(p); + for_each_cpu(cpu, p->prefer_cpus) { + if (unlikely(!tg->se[cpu])) + continue; + + if (idlest_cpu && available_idle_cpu(cpu)) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(capacity_of(cpu) - tg->se[cpu]->avg.util_avg); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (available_idle_cpu(cpu)) { + rcu_read_unlock(); + p->select_cpus = p->prefer_cpus; + return; + } + + util_avg_sum += tg->se[cpu]->avg.util_avg; + tg_capacity += capacity_of(cpu); + } + rcu_read_unlock(); + + if (tg_capacity > cpumask_weight(p->prefer_cpus) && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { + p->select_cpus = p->prefer_cpus; + } } +#endif
/* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -6628,13 +6750,24 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + int idlest_cpu = 0; +#endif
time = schedstat_start_time();
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, &idlest_cpu, sd_flag); +#endif + if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + && cpumask_test_cpu(cpu, p->select_cpus); +#else && cpumask_test_cpu(cpu, &p->cpus_allowed); +#endif }
rcu_read_lock(); @@ -6648,7 +6781,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + new_cpu = cpu; + if (cpu != prev_cpu && + cpumask_test_cpu(prev_cpu, p->select_cpus)) +#else if (cpu != prev_cpu) +#endif new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
sd = NULL; /* Prefer wake_affine over balance flags */ @@ -6673,6 +6812,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f current->recent_used_cpu = cpu; } rcu_read_unlock(); + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_test_cpu(new_cpu, p->select_cpus)) + new_cpu = idlest_cpu; +#endif schedstat_end_time(cpu_rq(cpu), time);
return new_cpu; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 345d4a14ce6d..ad62ea156afd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1325,6 +1325,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one_hundred, .extra2 = &one_thousand, }, +#endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .procname = "sched_util_low_pct", + .data = &sysctl_sched_util_low_pct, + .maxlen = sizeof(sysctl_sched_util_low_pct), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif { } };
From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA
--------------------------------
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 kernel/sched/fair.c | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 407bceee1126..7833ef8f32f4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7789,7 +7789,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, NULL, 0); + if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) { +#else if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { +#endif int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -7809,7 +7814,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { +#endif env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break;
From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA
--------------------------------
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 include/linux/sched.h | 17 +++++++++++++++++
 kernel/sched/core.c   |  8 ++++++++
 kernel/sched/debug.c  |  7 +++++++
 kernel/sched/fair.c   |  8 +++++++-
 4 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 8f27fa3e5622..928186f16100 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -444,6 +444,15 @@ struct sched_statistics { #endif };
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +struct dyn_affinity_stats { +#ifdef CONFIG_SCHEDSTATS + u64 nr_wakeups_preferred_cpus; + u64 nr_wakeups_force_preferred_cpus; +#endif +}; +#endif + struct sched_entity { /* For load-balancing: */ struct load_weight load; @@ -480,7 +489,15 @@ struct sched_entity { struct sched_avg avg; #endif
+#if !defined(__GENKSYMS__) +#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) + struct dyn_affinity_stats *dyn_affi_stats; +#else + KABI_RESERVE(1) +#endif +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 835f7c6c00ba..970616070da8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7203,12 +7203,20 @@ int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig) else cpumask_clear(p->prefer_cpus);
+ p->se.dyn_affi_stats = kzalloc(sizeof(struct dyn_affinity_stats), + GFP_KERNEL); + if (!p->se.dyn_affi_stats) { + kfree(p->prefer_cpus); + p->prefer_cpus = NULL; + return -ENOMEM; + } return 0; }
void sched_prefer_cpus_free(struct task_struct *p) { kfree(p->prefer_cpus); + kfree(p->se.dyn_affi_stats); }
static void do_set_prefer_cpus(struct task_struct *p, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fcf2a07ece05..bcdfdaae3b73 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -925,6 +925,9 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, struct seq_file *m) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + struct dyn_affinity_stats *dyn_affi = p->se.dyn_affi_stats; +#endif unsigned long nr_switches;
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), @@ -983,6 +986,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); P_SCHEDSTAT(se.statistics.nr_wakeups_passive); P_SCHEDSTAT(se.statistics.nr_wakeups_idle); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + __P(dyn_affi->nr_wakeups_preferred_cpus); + __P(dyn_affi->nr_wakeups_force_preferred_cpus); +#endif
avg_atom = p->se.sum_exec_runtime; if (nr_switches) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7833ef8f32f4..bcc72537b6fa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6714,6 +6714,8 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, if (available_idle_cpu(cpu)) { rcu_read_unlock(); p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus); return; }
@@ -6725,6 +6727,8 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, if (tg_capacity > cpumask_weight(p->prefer_cpus) && util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus); } } #endif @@ -6814,8 +6818,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_unlock();
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - if (!cpumask_test_cpu(new_cpu, p->select_cpus)) + if (!cpumask_test_cpu(new_cpu, p->select_cpus)) { new_cpu = idlest_cpu; + schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_force_preferred_cpus); + } #endif schedstat_end_time(cpu_rq(cpu), time);
From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA
--------------------------------
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 arch/arm64/configs/hulk_defconfig      | 1 +
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/hulk_defconfig        | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 4 files changed, 4 insertions(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index f3a15e856b5f..d0c20b1b24a9 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -125,6 +125,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index ec18673c4048..95225e2dbe52 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -137,6 +137,7 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y # CONFIG_CGROUP_DEBUG is not set CONFIG_SOCK_CGROUP_DATA=y CONFIG_CGROUP_FILES=y diff --git a/arch/x86/configs/hulk_defconfig b/arch/x86/configs/hulk_defconfig index 737250b21197..bdd662877601 100644 --- a/arch/x86/configs/hulk_defconfig +++ b/arch/x86/configs/hulk_defconfig @@ -135,6 +135,7 @@ CONFIG_BLK_CGROUP=y # CONFIG_DEBUG_BLK_CGROUP is not set CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 64f0805a5e96..1de117ae0932 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -131,6 +131,7 @@ CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_QOS_SCHED=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y