-----Original Message-----
From: Xiexiuqi
Sent: Tuesday, July 25, 2023 16:26
To: tanghui (C) <tanghui20@huawei.com>; kernel@openeuler.org
Cc: Chenhui (Judy) <judy.chenhui@huawei.com>; wangshaobo <bobo.shaobowang@huawei.com>
Subject: Re: [PATCH openEuler-23.09 2/7] cpuset: Introduce new interface for scheduler dynamic affinity
On 2023/7/25 14:57, Hui Tang wrote:
From: tanghui <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NS6Y
Add a 'prefer_cpus' sysfs file and related interfaces in cgroup cpuset.

Signed-off-by: tanghui <tanghui20@huawei.com>
---
 fs/proc/base.c         |  73 ++++++++++++++++++++
 include/linux/sched.h  |  10 +++
 init/init_task.c       |   3 +
 kernel/cgroup/cpuset.c | 150 ++++++++++++++++++++++++++++++++++++++++-
 kernel/fork.c          |  13 ++++
 kernel/sched/core.c    |  95 ++++++++++++++++++++++++++
 6 files changed, 343 insertions(+), 1 deletion(-)
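[Reviewer note: to make the new knob concrete, here is a minimal, untested userspace sketch that drives the per-task /proc/<pid>/preferred_cpuset file this patch adds (the matching cgroup-side file is cpuset.preferred_cpus). Only the proc file name comes from the patch; everything else in the sketch is illustrative.]

/* sketch: write a cpulist such as "0-3" to /proc/<pid>/preferred_cpuset */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64];
	FILE *f;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <cpulist>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/proc/%s/preferred_cpuset", argv[1]);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}

	fprintf(f, "%s\n", argv[2]);	/* parsed by cpumask_parselist_user() */
	return fclose(f) ? 1 : 0;	/* fclose() flushes and reports the write error */
}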
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 05452c3b9872..7183f338404d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3165,6 +3165,76 @@ static const struct file_operations proc_setgroups_operations = {
 };
 #endif /* CONFIG_USER_NS */
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+static int preferred_cpuset_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (p->prefer_cpus)
+		seq_printf(m, "%*pbl\n", cpumask_pr_args(p->prefer_cpus));
+	else
+		seq_putc(m, '\n');
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static ssize_t preferred_cpuset_write(struct file *file, const char __user *buf,
+				      size_t count, loff_t *offset)
+{
+	cpumask_var_t new_mask;
+	int retval;
+	struct inode *inode = file_inode(file);
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+
+	retval = cpumask_parselist_user(buf, count, new_mask);
+	if (retval < 0)
+		goto out_free_cpumask;
+
+	retval = set_prefer_cpus_ptr(p, new_mask);
+	if (retval < 0)
+		goto out_free_cpumask;
+
+	retval = count;
+
+out_free_cpumask:
+	free_cpumask_var(new_mask);
+out_put_task:
+	put_task_struct(p);
+
+	return retval;
+}
+
+static int preferred_cpuset_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, preferred_cpuset_show, inode);
+}
+
+static const struct file_operations proc_preferred_cpuset_operations = {
+	.open		= preferred_cpuset_open,
+	.write		= preferred_cpuset_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
 
 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *task)
 {
@@ -3689,6 +3759,9 @@ static const struct pid_entry tid_base_stuff[] = {
 	ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
 	ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eed5d65b8d1f..ce6208bfb530 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1534,6 +1534,10 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_t			*prefer_cpus;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
@@ -2444,4 +2448,10 @@ static inline void sched_core_fork(struct task_struct *p) { }
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+int set_prefer_cpus_ptr(struct task_struct *p,
+			const struct cpumask *new_mask);
+int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask);
+void sched_prefer_cpus_free(struct task_struct *p);
+#endif
 #endif
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b..ac0c5850f74b 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -207,6 +207,9 @@ struct task_struct init_task
 #ifdef CONFIG_SECURITY
 	.security	= NULL,
 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	.prefer_cpus	= NULL,
+#endif
 #ifdef CONFIG_SECCOMP_FILTER
 	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
 #endif
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e4ca2dd2b764..0862dbeca1c8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -138,6 +138,9 @@ struct cpuset {
 	/* user-configured CPUs and Memory Nodes allow to tasks */
 	cpumask_var_t cpus_allowed;
 	nodemask_t mems_allowed;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_var_t prefer_cpus;
+#endif
 
 	/* effective CPUs and Memory Nodes allow to tasks */
 	cpumask_var_t effective_cpus;
@@ -227,6 +230,9 @@ static inline bool is_prs_invalid(int prs_state)
 struct tmpmasks {
 	cpumask_var_t addmask, delmask;	/* For partition root */
 	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_var_t prefer_cpus;
+#endif
 };
 
 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -597,15 +603,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
 	cpumask_var_t *pmask1, *pmask2, *pmask3;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_var_t *pmask4;
+#endif
 
 	if (cs) {
 		pmask1 = &cs->cpus_allowed;
 		pmask2 = &cs->effective_cpus;
 		pmask3 = &cs->subparts_cpus;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		pmask4 = &cs->prefer_cpus;
+#endif
 	} else {
 		pmask1 = &tmp->new_cpus;
 		pmask2 = &tmp->addmask;
 		pmask3 = &tmp->delmask;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		pmask4 = &tmp->prefer_cpus;
+#endif
 	}
 
 	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
@@ -616,9 +631,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 
 	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
 		goto free_two;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!zalloc_cpumask_var(pmask4, GFP_KERNEL))
+		goto free_three;
+#endif
 
 	return 0;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+free_three:
+	free_cpumask_var(*pmask3);
+#endif
 free_two:
 	free_cpumask_var(*pmask2);
 free_one:
@@ -634,11 +657,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
 	if (cs) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		free_cpumask_var(cs->prefer_cpus);
+#endif
 		free_cpumask_var(cs->cpus_allowed);
 		free_cpumask_var(cs->effective_cpus);
 		free_cpumask_var(cs->subparts_cpus);
 	}
 	if (tmp) {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		free_cpumask_var(tmp->prefer_cpus);
+#endif
 		free_cpumask_var(tmp->new_cpus);
 		free_cpumask_var(tmp->addmask);
 		free_cpumask_var(tmp->delmask);
@@ -662,6 +691,9 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 		return NULL;
 	}
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(trial->prefer_cpus, cs->prefer_cpus);
+#endif
 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
 	return trial;
 
@@ -743,6 +775,12 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	if (cur == &top_cpuset)
 		goto out;
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	ret = -EINVAL;
+	if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed))
+		goto out;
+#endif
+
 	par = parent_cs(cur);
 
 	/*
@@ -791,6 +829,66 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	return ret;
 }
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+static cpumask_var_t prefer_cpus_attach;
+
+static void update_tasks_prefer_cpumask(struct cpuset *cs)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	css_task_iter_start(&cs->css, 0, &it);
+	while ((task = css_task_iter_next(&it)))
+		set_prefer_cpus_ptr(task, cs->prefer_cpus);
+	css_task_iter_end(&it);
+}
+
+/*
+ * update_prefer_cpumask - update the prefer_cpus mask of a cpuset and
+ *			   all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+				 const char *buf)
+{
+	int retval;
+
+	if (cs == &top_cpuset)
+		return -EACCES;
+
+	/*
+	 * An empty prefer_cpus is OK; it means the cpuset's tasks disable
+	 * the dynamic affinity feature.
+	 * Since cpulist_parse() fails on an empty mask, we special case
+	 * that parsing.
+	 */
+	if (!*buf) {
+		cpumask_clear(trialcs->prefer_cpus);
+	} else {
+		retval = cpulist_parse(buf, trialcs->prefer_cpus);
+		if (retval < 0)
+			return retval;
+	}
+
+	/* Nothing to do if the cpus didn't change */
+	if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus))
+		return 0;
+
+	if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed))
+		return -EINVAL;
+
+	update_tasks_prefer_cpumask(trialcs);
+
+	spin_lock_irq(&callback_lock);
+	cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus);
+	spin_unlock_irq(&callback_lock);
+
+	return 0;
+}
+#endif
 
 #ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
@@ -2547,6 +2645,10 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 	 * fail. TODO: have a better way to handle failure here
 	 */
 	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(prefer_cpus_attach, cs->prefer_cpus);
+	set_prefer_cpus_ptr(task, prefer_cpus_attach);
+#endif
 
 	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 	cpuset_update_task_spread_flags(cs, task);
@@ -2648,6 +2750,9 @@ typedef enum {
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	FILE_DYNAMIC_CPULIST,
+#endif
 } cpuset_filetype_t;
 
 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -2778,6 +2883,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	case FILE_MEMLIST:
 		retval = update_nodemask(cs, trialcs, buf);
 		break;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	case FILE_DYNAMIC_CPULIST:
+		retval = update_prefer_cpumask(cs, trialcs, buf);
+		break;
+#endif
 	default:
 		retval = -EINVAL;
 		break;
@@ -2825,6 +2935,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	case FILE_SUBPARTS_CPULIST:
 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
 		break;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	case FILE_DYNAMIC_CPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus));
+		break;
+#endif
 	default:
 		ret = -EINVAL;
 	}
@@ -3047,7 +3162,15 @@ static struct cftype legacy_files[] = {
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_MEMORY_PRESSURE_ENABLED,
 	},
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	{
+		.name = "preferred_cpus",
+		.seq_show = cpuset_common_seq_show,
+		.write = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
+		.private = FILE_DYNAMIC_CPULIST,
+	},
+#endif
 
 	{ }	/* terminate */
 };
 
@@ -3205,6 +3328,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_copy(cs->prefer_cpus, parent->prefer_cpus);
+#endif
 	spin_unlock_irq(&callback_lock);
 
 out_unlock:
 	percpu_up_write(&cpuset_rwsem);
@@ -3358,6 +3484,9 @@ static void cpuset_fork(struct task_struct *task)
 			return;
 
 		set_cpus_allowed_ptr(task, current->cpus_ptr);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+		set_prefer_cpus_ptr(task, current->prefer_cpus);
+#endif
 		task->mems_allowed = current->mems_allowed;
 		return;
 	}
 
@@ -3404,17 +3533,26 @@ int __init cpuset_init(void)
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
 	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL));
+#endif
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 	cpumask_setall(top_cpuset.effective_cpus);
 	nodes_setall(top_cpuset.effective_mems);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_clear(top_cpuset.prefer_cpus);
+#endif
 
 	fmeter_init(&top_cpuset.fmeter);
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
 
 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL));
+#endif
 
 	return 0;
 }
 
@@ -3451,6 +3589,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 			    struct cpumask *new_cpus, nodemask_t *new_mems,
 			    bool cpus_updated, bool mems_updated)
 {
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	cpumask_t prefer_cpus;
+#endif
 	bool is_empty;
 
 	spin_lock_irq(&callback_lock);
@@ -3469,6 +3610,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	if (mems_updated && !nodes_empty(cs->mems_allowed))
 		update_tasks_nodemask(cs);
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) {
+		cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed);
+		cpumask_copy(cs->prefer_cpus, &prefer_cpus);
+		update_tasks_prefer_cpumask(cs);
+	}
+#endif
+
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		   nodes_empty(cs->mems_allowed);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 41c964104b58..b6ba64f94822 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -628,6 +628,9 @@ void free_task(struct task_struct *tsk)
 	if (tsk->flags & PF_KTHREAD)
 		free_kthread_struct(tsk);
 	bpf_task_storage_free(tsk);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	sched_prefer_cpus_free(tsk);
+#endif
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1136,6 +1139,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->seccomp.filter = NULL;
 #endif
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	tsk->prefer_cpus = NULL;
+#endif
+
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
@@ -2360,6 +2367,12 @@ __latent_entropy struct task_struct *copy_process(
 
 	rt_mutex_init_task(p);
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+	retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
+	if (retval)
+		goto bad_fork_free;
+#endif
+
 	lockdep_assert_irqs_enabled();
 #ifdef CONFIG_PROVE_LOCKING
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..836a3b93ca6d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11330,6 +11330,101 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
 	return 0;
 }
 
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask)
+{
+	p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
With this CONFIG enabled, every task gets a prefer_cpus allocation. Allocating it dynamically on the fork path adds overhead to fork; if the feature is not used, can this be avoided? For example:
1. Add a startup switch, so that none of this work is done when the switch is off;
2. If prefer_cpus were made part of task_struct itself, would it be more efficient to manage, being allocated and freed together with task_struct?

Can this feature be off by default, and only be enabled when needed?

------ Patch 6 adds a cmdline option; it is disabled by default and is enabled via cmdline in the scenarios that need it.
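[Reviewer note: a minimal sketch of suggestion 1, assuming a static key that the cmdline switch would flip. The key name is made up here, and this is not necessarily how patch 6 implements it; __set_prefer_cpus_ptr() below already tolerates a NULL prefer_cpus, so skipping the allocation is safe.]

/* Sketch only -- "dynamic_affinity_enabled" is an illustrative name. */
DEFINE_STATIC_KEY_FALSE(dynamic_affinity_enabled);

int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask)
{
	/* Feature off: skip the allocation entirely, fork pays nothing. */
	if (!static_branch_unlikely(&dynamic_affinity_enabled)) {
		p->prefer_cpus = NULL;
		return 0;
	}

	p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
	if (!p->prefer_cpus)
		return -ENOMEM;

	if (mask)
		cpumask_copy(p->prefer_cpus, mask);
	else
		cpumask_clear(p->prefer_cpus);

	return 0;
}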
+	if (!p->prefer_cpus)
+		return -ENOMEM;
+
+	if (mask)
+		cpumask_copy(p->prefer_cpus, mask);
+	else
+		cpumask_clear(p->prefer_cpus);
+
+	return 0;
+}
+
+void sched_prefer_cpus_free(struct task_struct *p)
+{
+	kfree(p->prefer_cpus);
+}
+
+static void do_set_prefer_cpus(struct task_struct *p,
+			       const struct cpumask *new_mask)
+{
+	struct rq *rq = task_rq(p);
+	bool queued, running;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+
+	if (queued) {
+		/*
+		 * Because __kthread_bind() calls this on blocked tasks without
+		 * holding rq->lock.
+		 */
+		lockdep_assert_held(&rq->__lock);
+		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+	}
+	if (running)
+		put_prev_task(rq, p);
+
+	cpumask_copy(p->prefer_cpus, new_mask);
+
+	if (queued)
+		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+	if (running)
+		set_next_task(rq, p);
+}
+
+/*
+ * Change a given task's preferred CPU affinity. Migrating the thread to
+ * the preferred CPUs is prioritized according to the preferred bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, and the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_prefer_cpus_ptr(struct task_struct *p,
+				 const struct cpumask *new_mask, bool check)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+	int ret = 0;
+
+	if (unlikely(!p->prefer_cpus))
+		return -EINVAL;
+
+	rq = task_rq_lock(p, &rf);
+	update_rq_clock(rq);
+
+	if (cpumask_equal(p->prefer_cpus, new_mask))
+		goto out;
+
+	if (!cpumask_subset(new_mask, p->cpus_ptr)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_prefer_cpus(p, new_mask);
+out:
+	task_rq_unlock(rq, p, &rf);
+
+	return ret;
+}
+
+int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class != &fair_sched_class)
+		return 0;
+
+	return __set_prefer_cpus_ptr(p, new_mask, false);
+}
+#endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
 static int cpu_max_show(struct seq_file *sf, void *v)
 {