From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA
--------------------------------
Add a 'prefer_cpus' mask and the 'cpuset.preferred_cpus' interface file,
plus the related per-task plumbing, to cgroup cpuset.
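The 'prefer_cpus' mask is carried per task and must stay a subset of the
cpuset's 'cpuset.cpus'; writing an empty list clears the mask and disables
dynamic affinity for the cpuset's tasks.

Illustration only (not part of this patch): assuming a cgroup v1 cpuset
hierarchy mounted at /sys/fs/cgroup/cpuset and an existing child group
"test" whose 'cpuset.cpus' is already configured, the new file can be
exercised from userspace as below; the mount point and group name are
examples:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Example path; adjust mount point and group name. */
		const char *path =
			"/sys/fs/cgroup/cpuset/test/cpuset.preferred_cpus";
		/* Must be a subset of the group's cpuset.cpus. */
		const char *list = "0-1";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Writing an empty string instead clears the mask. */
		if (write(fd, list, strlen(list)) != (ssize_t)strlen(list))
			perror("write");
		close(fd);
		return 0;
	}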
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 include/linux/sched.h  |  17 +++++
 init/init_task.c       |   3 +
 kernel/cgroup/cpuset.c | 151 ++++++++++++++++++++++++++++++++++++++++-
 kernel/fork.c          |  13 ++++
 kernel/sched/core.c    |  95 ++++++++++++++++++++++++++
 5 files changed, 278 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ca020a991b33..718ec0289d83 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1247,7 +1247,16 @@ struct task_struct {
 #else
 	KABI_RESERVE(5)
 #endif
+
+#if !defined(__GENKSYMS__)
+#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY)
+	cpumask_t *prefer_cpus;
+#else
+	KABI_RESERVE(6)
+#endif
+#else
 	KABI_RESERVE(6)
+#endif
 	KABI_RESERVE(7)
 	KABI_RESERVE(8)
@@ -1964,4 +1973,12 @@ static inline int sched_qos_cpu_overload(void)
 }
 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+int dynamic_affinity_enabled(void);
+int set_prefer_cpus_ptr(struct task_struct *p,
+			const struct cpumask *new_mask);
+int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig);
+void sched_prefer_cpus_free(struct task_struct *p);
+#endif
+
 #endif
diff --git a/init/init_task.c b/init/init_task.c
index 57ff82ab9811..b312a045f4b9 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -180,6 +180,9 @@ struct task_struct init_task
 #ifdef CONFIG_SECURITY
 	.security	= NULL,
 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	.prefer_cpus	= NULL,
+#endif
 #ifdef CONFIG_PID_RESERVE
 	.fork_pid_union = {
 		.fork_pid	= 0,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3544ee391350..55bfbc4cdb16 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -104,6 +104,9 @@ struct cpuset {
 	/* user-configured CPUs and Memory Nodes allow to tasks */
 	cpumask_var_t cpus_allowed;
 	nodemask_t mems_allowed;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_var_t prefer_cpus;
+#endif
 	/* effective CPUs and Memory Nodes allow to tasks */
 	cpumask_var_t effective_cpus;
@@ -436,11 +439,22 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 		goto free_cs;
 	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
 		goto free_cpus;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!alloc_cpumask_var(&trial->prefer_cpus, GFP_KERNEL))
+		goto free_prefer_cpus;
+#endif
 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(trial->prefer_cpus, cs->prefer_cpus);
+#endif
 	return trial;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+free_prefer_cpus:
+	free_cpumask_var(trial->effective_cpus);
+#endif
 free_cpus:
 	free_cpumask_var(trial->cpus_allowed);
 free_cs:
@@ -456,6 +470,9 @@ static void free_trial_cpuset(struct cpuset *trial)
 {
 	free_cpumask_var(trial->effective_cpus);
 	free_cpumask_var(trial->cpus_allowed);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	free_cpumask_var(trial->prefer_cpus);
+#endif
 	kfree(trial);
 }
@@ -487,6 +504,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	rcu_read_lock();

+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	ret = -EINVAL;
+	if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed))
+		goto out;
+#endif
 	/* Each of our child cpusets must be a subset of us */
 	ret = -EBUSY;
 	cpuset_for_each_child(c, css, cur)
@@ -551,6 +573,66 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	return ret;
 }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+static cpumask_var_t prefer_cpus_attach;
+
+static void update_tasks_prefer_cpumask(struct cpuset *cs)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	css_task_iter_start(&cs->css, 0, &it);
+	while ((task = css_task_iter_next(&it)))
+		set_prefer_cpus_ptr(task, cs->prefer_cpus);
+	css_task_iter_end(&it);
+}
+
+/*
+ * update_prefer_cpumask - update the prefer_cpus mask of a cpuset and
+ * all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+				 const char *buf)
+{
+	int retval;
+
+	if (cs == &top_cpuset)
+		return -EACCES;
+
+	/*
+	 * An empty prefer_cpus is ok; it means that dynamic affinity is
+	 * disabled for the tasks of this cpuset.
+	 * Since cpulist_parse() fails on an empty mask, we special case
+	 * that parsing.
+	 */
+	if (!*buf) {
+		cpumask_clear(trialcs->prefer_cpus);
+	} else {
+		retval = cpulist_parse(buf, trialcs->prefer_cpus);
+		if (retval < 0)
+			return retval;
+	}
+
+	/* Nothing to do if the cpus didn't change */
+	if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus))
+		return 0;
+
+	if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed))
+		return -EINVAL;
+
+	update_tasks_prefer_cpumask(trialcs);
+
+	spin_lock_irq(&callback_lock);
+	cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus);
+	spin_unlock_irq(&callback_lock);
+
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
@@ -1543,6 +1625,10 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	else
 		guarantee_online_cpus(cs, cpus_attach);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(prefer_cpus_attach, cs->prefer_cpus);
+#endif
+
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 	cgroup_taskset_for_each(task, css, tset) {
@@ -1551,6 +1637,9 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		 * fail.  TODO: have a better way to handle failure here
 		 */
 		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		set_prefer_cpus_ptr(task, prefer_cpus_attach);
+#endif
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 		cpuset_update_task_spread_flag(cs, task);
@@ -1610,6 +1699,9 @@ typedef enum {
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	FILE_DYNAMIC_CPULIST,
+#endif
 } cpuset_filetype_t;
 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -1735,6 +1827,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	case FILE_MEMLIST:
 		retval = update_nodemask(cs, trialcs, buf);
 		break;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	case FILE_DYNAMIC_CPULIST:
+		retval = update_prefer_cpumask(cs, trialcs, buf);
+		break;
+#endif
 	default:
 		retval = -EINVAL;
 		break;
@@ -1778,6 +1875,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	case FILE_EFFECTIVE_MEMLIST:
 		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
 		break;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	case FILE_DYNAMIC_CPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus));
+		break;
+#endif
 	default:
 		ret = -EINVAL;
 	}
@@ -1935,7 +2037,15 @@ static struct cftype files[] = {
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_MEMORY_PRESSURE_ENABLED,
 	},
-
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	{
+		.name = "preferred_cpus",
+		.seq_show = cpuset_common_seq_show,
+		.write = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
+		.private = FILE_DYNAMIC_CPULIST,
+	},
+#endif
 	{ }	/* terminate */
 };
@@ -1959,17 +2069,28 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 		goto free_cs;
 	if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
 		goto free_cpus;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!alloc_cpumask_var(&cs->prefer_cpus, GFP_KERNEL))
+		goto free_effective_cpus;
+#endif
 	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
 	cpumask_clear(cs->effective_cpus);
 	nodes_clear(cs->effective_mems);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_clear(cs->prefer_cpus);
+#endif
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 	return &cs->css;

+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+free_effective_cpus:
+	free_cpumask_var(cs->effective_cpus);
+#endif
 free_cpus:
 	free_cpumask_var(cs->cpus_allowed);
 free_cs:
@@ -2034,6 +2155,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(cs->prefer_cpus, parent->prefer_cpus);
+#endif
 	spin_unlock_irq(&callback_lock);
out_unlock:
 	mutex_unlock(&cpuset_mutex);
@@ -2065,6 +2189,9 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	free_cpumask_var(cs->prefer_cpus);
+#endif
 	free_cpumask_var(cs->effective_cpus);
 	free_cpumask_var(cs->cpus_allowed);
 	kfree(cs);
@@ -2099,6 +2226,9 @@ static void cpuset_fork(struct task_struct *task)
 		return;
 	set_cpus_allowed_ptr(task, &current->cpus_allowed);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	set_prefer_cpus_ptr(task, current->prefer_cpus);
+#endif
 	task->mems_allowed = current->mems_allowed;
 }
@@ -2129,11 +2259,17 @@ int __init cpuset_init(void)
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL));
+#endif
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 	cpumask_setall(top_cpuset.effective_cpus);
 	nodes_setall(top_cpuset.effective_mems);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_clear(top_cpuset.prefer_cpus);
+#endif
 	fmeter_init(&top_cpuset.fmeter);
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2144,6 +2280,9 @@ int __init cpuset_init(void)
 		return err;
 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL));
+#endif
 	return 0;
 }
@@ -2180,6 +2319,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 			    struct cpumask *new_cpus, nodemask_t *new_mems,
 			    bool cpus_updated, bool mems_updated)
 {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_t prefer_cpus;
+#endif
 	bool is_empty;
 	spin_lock_irq(&callback_lock);
@@ -2198,6 +2340,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	if (mems_updated && !nodes_empty(cs->mems_allowed))
 		update_tasks_nodemask(cs);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) {
+		cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed);
+		cpumask_copy(cs->prefer_cpus, &prefer_cpus);
+		update_tasks_prefer_cpumask(cs);
+	}
+#endif
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		   nodes_empty(cs->mems_allowed);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7608869f4f1e..4207d5e5958b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -459,6 +459,9 @@ void free_task(struct task_struct *tsk)
 	arch_release_task_struct(tsk);
 	if (tsk->flags & PF_KTHREAD)
 		free_kthread_struct(tsk);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	sched_prefer_cpus_free(tsk);
+#endif
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -888,6 +891,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->seccomp.filter = NULL;
 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	tsk->prefer_cpus = NULL;
+#endif
+
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
@@ -1862,6 +1869,12 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval < 0)
 		goto bad_fork_free;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	retval = sched_prefer_cpus_fork(p, current);
+	if (retval)
+		goto bad_fork_free;
+#endif
+
 	/*
 	 * If multiple threads are within copy_process(), then this check
 	 * triggers too late. This doesn't hurt, the check is only there
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 36d7422da0ac..835f7c6c00ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7191,6 +7191,101 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
 	return 0;
 }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig)
+{
+	p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+	if (!p->prefer_cpus)
+		return -ENOMEM;
+
+	if (orig->prefer_cpus)
+		cpumask_copy(p->prefer_cpus, orig->prefer_cpus);
+	else
+		cpumask_clear(p->prefer_cpus);
+
+	return 0;
+}
+
+void sched_prefer_cpus_free(struct task_struct *p)
+{
+	kfree(p->prefer_cpus);
+}
+
+static void do_set_prefer_cpus(struct task_struct *p,
+			       const struct cpumask *new_mask)
+{
+	struct rq *rq = task_rq(p);
+	bool queued, running;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+
+	if (queued) {
+		/*
+		 * Because __kthread_bind() calls this on blocked tasks without
+		 * holding rq->lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+	}
+	if (running)
+		put_prev_task(rq, p);
+
+	cpumask_copy(p->prefer_cpus, new_mask);
+
+	if (queued)
+		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+	if (running)
+		set_curr_task(rq, p);
+}
+
+/*
+ * Change a given task's preferred CPU affinity. The scheduler will
+ * preferentially migrate the thread to CPUs in the preferred bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_prefer_cpus_ptr(struct task_struct *p,
+				 const struct cpumask *new_mask, bool check)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+	int ret = 0;
+
+	if (unlikely(!p->prefer_cpus))
+		return -EINVAL;
+
+	rq = task_rq_lock(p, &rf);
+	update_rq_clock(rq);
+
+	if (cpumask_equal(p->prefer_cpus, new_mask))
+		goto out;
+
+	if (!cpumask_subset(new_mask, &p->cpus_allowed)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_prefer_cpus(p, new_mask);
+out:
+	task_rq_unlock(rq, p, &rf);
+
+	return ret;
+}
+
+int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class != &fair_sched_class)
+		return 0;
+
+	return __set_prefer_cpus_ptr(p, new_mask, false);
+}
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 static int cpu_max_show(struct seq_file *sf, void *v)
 {
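Note for reviewers (not part of this patch): this change only adds the
interface and per-task plumbing; the scheduler-side consumer of
p->prefer_cpus is expected in a follow-up of the dynamic-affinity series.
As a rough, hypothetical sketch of the intended use (the function name and
placement below are illustrative only, not the actual follow-up code), a
fair-class wakeup path could narrow its CPU search to the preferred mask:

	/* Illustrative sketch only; not part of this patch. */
	static void sketch_select_mask(struct task_struct *p,
				       struct cpumask *search_cpus)
	{
		/* Default: search the full affinity mask. */
		cpumask_copy(search_cpus, &p->cpus_allowed);

		/* An unset or empty prefer_cpus means the feature is off. */
		if (!p->prefer_cpus || cpumask_empty(p->prefer_cpus))
			return;

		/* Bias the search toward preferred, still-allowed CPUs. */
		if (cpumask_intersects(p->prefer_cpus, &p->cpus_allowed))
			cpumask_and(search_cpus, p->prefer_cpus,
				    &p->cpus_allowed);
	}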