From: tanghui tanghui20@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NS6Y
--------------------------------
Dynamic affinity sets preferred CPUs for a task. When the utilization of the task group's preferred CPUs is low, the task runs only on those preferred CPUs, which improves CPU resource locality and reduces interference between task cgroups; otherwise the task may burst beyond the preferred CPUs and use other CPUs within its cpus_allowed.
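As a rough worked example of the intended behaviour (the 85% default threshold comes from the sched_util_low_pct sysctl added later in this series, so treat the numbers as illustrative): if a task group's preferred CPUs are 0-3, each with capacity 1024 (4096 in total), wakeups stay confined to CPUs 0-3 while the group's summed util_avg on those CPUs is at most 85% of 4096, roughly 3481; once it rises above that, wakeups may spill into the remaining CPUs in cpus_allowed.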
Signed-off-by: tanghui tanghui20@huawei.com
---
 init/Kconfig | 10 ++++++++++
 1 file changed, 10 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig
index 32c24950c4ce..7a3299a632e0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1013,6 +1013,16 @@ config RT_GROUP_SCHED
endif #CGROUP_SCHED
+config QOS_SCHED_DYNAMIC_AFFINITY
+	bool "qos dynamic affinity"
+	depends on CPUSETS
+	default n
+	help
+	  This feature lets you allocate preferred CPUs to a task group. If
+	  enabled, tasks in the task group use only the preferred CPUs while
+	  the CPU utilization of the task group is below the configured
+	  threshold; otherwise they may use any CPU in cpus_allowed.
+
 config SCHED_MM_CID
 	def_bool y
 	depends on SMP && RSEQ
From: tanghui tanghui20@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NS6Y
--------------------------------
Add a 'prefer_cpus' interface and the related plumbing to cgroup cpuset, along with a per-task 'preferred_cpuset' file in procfs.
Signed-off-by: tanghui tanghui20@huawei.com
---
 fs/proc/base.c         |  73 ++++++++++++++++++++
 include/linux/sched.h  |  10 +++
 init/init_task.c       |   3 +
 kernel/cgroup/cpuset.c | 150 ++++++++++++++++++++++++++++++++++++++++-
 kernel/fork.c          |  13 ++++
 kernel/sched/core.c    |  95 ++++++++++++++++++++++++++
 6 files changed, 343 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 05452c3b9872..7183f338404d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3165,6 +3165,76 @@ static const struct file_operations proc_setgroups_operations = { }; #endif /* CONFIG_USER_NS */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + +static int preferred_cpuset_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p->prefer_cpus) + seq_printf(m, "%*pbl\n", cpumask_pr_args(p->prefer_cpus)); + else + seq_putc(m, '\n'); + + put_task_struct(p); + + return 0; +} + +static ssize_t preferred_cpuset_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + cpumask_var_t new_mask; + int retval; + struct inode *inode = file_inode(file); + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + + retval = cpumask_parselist_user(buf, count, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = set_prefer_cpus_ptr(p, new_mask); + if (retval < 0) + goto out_free_cpumask; + + retval = count; + +out_free_cpumask: + free_cpumask_var(new_mask); +out_put_task: + put_task_struct(p); + + return retval; +} + +static int preferred_cpuset_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, preferred_cpuset_show, inode); +} + +static const struct file_operations proc_preferred_cpuset_operations = { + .open = preferred_cpuset_open, + .write = preferred_cpuset_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3689,6 +3759,9 @@ static const struct pid_entry tid_base_stuff[] = { ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations), +#endif };
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched.h b/include/linux/sched.h index eed5d65b8d1f..ce6208bfb530 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1534,6 +1534,10 @@ struct task_struct { struct user_event_mm *user_event_mm; #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t *prefer_cpus; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. @@ -2444,4 +2448,10 @@ static inline void sched_core_fork(struct task_struct *p) { }
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask); +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask); +void sched_prefer_cpus_free(struct task_struct *p); +#endif #endif diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bfe6b..ac0c5850f74b 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -207,6 +207,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + .prefer_cpus = NULL, +#endif #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e4ca2dd2b764..0862dbeca1c8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -138,6 +138,9 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif
/* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; @@ -227,6 +230,9 @@ static inline bool is_prs_invalid(int prs_state) struct tmpmasks { cpumask_var_t addmask, delmask; /* For partition root */ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t prefer_cpus; +#endif };
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) @@ -597,15 +603,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { cpumask_var_t *pmask1, *pmask2, *pmask3; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_var_t *pmask4; +#endif
if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; pmask3 = &cs->subparts_cpus; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &cs->prefer_cpus; +#endif } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + pmask4 = &tmp->prefer_cpus; +#endif }
if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) @@ -616,9 +631,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!zalloc_cpumask_var(pmask4, GFP_KERNEL)) + goto free_three; +#endif
return 0;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +free_three: + free_cpumask_var(*pmask3); +#endif free_two: free_cpumask_var(*pmask2); free_one: @@ -634,11 +657,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { if (cs) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(cs->prefer_cpus); +#endif free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->subparts_cpus); } if (tmp) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + free_cpumask_var(tmp->prefer_cpus); +#endif free_cpumask_var(tmp->new_cpus); free_cpumask_var(tmp->addmask); free_cpumask_var(tmp->delmask); @@ -662,6 +691,9 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) return NULL; }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(trial->prefer_cpus, cs->prefer_cpus); +#endif cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; @@ -743,6 +775,12 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) if (cur == &top_cpuset) goto out;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + ret = -EINVAL; + if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed)) + goto out; +#endif + par = parent_cs(cur);
/* @@ -791,6 +829,66 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) return ret; }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +static cpumask_var_t prefer_cpus_attach; + +static void update_tasks_prefer_cpumask(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + set_prefer_cpus_ptr(task, cs->prefer_cpus); + css_task_iter_end(&it); +} + +/* + * update_prefer_cpumask - update the prefer_cpus mask of a cpuset and + * all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + + if (cs == &top_cpuset) + return -EACCES; + + /* + * An empty prefer_cpus is ok which mean that the cpuset tasks disable + * dynamic affinity feature. + * Since cpulist_parse() fails on an empty mask, we special case + * that parsing. + */ + if (!*buf) { + cpumask_clear(trialcs->prefer_cpus); + } else { + retval = cpulist_parse(buf, trialcs->prefer_cpus); + if (retval < 0) + return retval; + } + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus)) + return 0; + + if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed)) + return -EINVAL; + + update_tasks_prefer_cpumask(trialcs); + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus); + spin_unlock_irq(&callback_lock); + + return 0; +} +#endif + #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). @@ -2547,6 +2645,10 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) * fail. TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(prefer_cpus_attach, cs->prefer_cpus); + set_prefer_cpus_ptr(task, prefer_cpus_attach); +#endif
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flags(cs, task); @@ -2648,6 +2750,9 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + FILE_DYNAMIC_CPULIST, +#endif } cpuset_filetype_t;
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, @@ -2778,6 +2883,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + retval = update_prefer_cpumask(cs, trialcs, buf); + break; +#endif default: retval = -EINVAL; break; @@ -2825,6 +2935,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); break; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + case FILE_DYNAMIC_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus)); + break; +#endif default: ret = -EINVAL; } @@ -3047,7 +3162,15 @@ static struct cftype legacy_files[] = { .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE_ENABLED, }, - +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .name = "preferred_cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_DYNAMIC_CPULIST, + }, +#endif { } /* terminate */ };
@@ -3205,6 +3328,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_copy(cs->prefer_cpus, parent->prefer_cpus); +#endif spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); @@ -3358,6 +3484,9 @@ static void cpuset_fork(struct task_struct *task) return;
set_cpus_allowed_ptr(task, current->cpus_ptr); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_prefer_cpus_ptr(task, current->prefer_cpus); +#endif task->mems_allowed = current->mems_allowed; return; } @@ -3404,17 +3533,26 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL)); +#endif
cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_clear(top_cpuset.prefer_cpus); +#endif
fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1;
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL)); +#endif
return 0; } @@ -3451,6 +3589,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_t prefer_cpus; +#endif bool is_empty;
spin_lock_irq(&callback_lock); @@ -3469,6 +3610,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) { + cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed); + cpumask_copy(cs->prefer_cpus, &prefer_cpus); + update_tasks_prefer_cpumask(cs); + } +#endif is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed);
diff --git a/kernel/fork.c b/kernel/fork.c index 41c964104b58..b6ba64f94822 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -628,6 +628,9 @@ void free_task(struct task_struct *tsk) if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); bpf_task_storage_free(tsk); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + sched_prefer_cpus_free(tsk); +#endif free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1136,6 +1139,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->seccomp.filter = NULL; #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + tsk->prefer_cpus = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -2360,6 +2367,12 @@ __latent_entropy struct task_struct *copy_process(
rt_mutex_init_task(p);
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + retval = sched_prefer_cpus_fork(p, current->prefer_cpus); + if (retval) + goto bad_fork_free; +#endif + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a68d1276bab0..836a3b93ca6d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11330,6 +11330,101 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, return 0; }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask) +{ + p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!p->prefer_cpus) + return -ENOMEM; + + if (mask) + cpumask_copy(p->prefer_cpus, mask); + else + cpumask_clear(p->prefer_cpus); + + return 0; +} + +void sched_prefer_cpus_free(struct task_struct *p) +{ + kfree(p->prefer_cpus); +} + +static void do_set_prefer_cpus(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->__lock); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + } + if (running) + put_prev_task(rq, p); + + cpumask_copy(p->prefer_cpus, new_mask); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); +} + +/* + * Change a given task's prefer CPU affinity. Prioritize migrate the thread to + * prefer cpus according to preferred bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_prefer_cpus_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + struct rq_flags rf; + struct rq *rq; + int ret = 0; + + if (unlikely(!p->prefer_cpus)) + return -EINVAL; + + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + if (cpumask_equal(p->prefer_cpus, new_mask)) + goto out; + + if (!cpumask_subset(new_mask, p->cpus_ptr)) { + ret = -EINVAL; + goto out; + } + + do_set_prefer_cpus(p, new_mask); +out: + task_rq_unlock(rq, p, &rf); + + return ret; +} + +int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class != &fair_sched_class) + return 0; + + return __set_prefer_cpus_ptr(p, new_mask, false); +} +#endif + #ifdef CONFIG_CFS_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) {
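For context, a minimal user-space sketch of how the knobs added by this patch might be exercised. The file names come from the hunks above ('preferred_cpus' under the v1 cpuset controller, so presumably 'cpuset.preferred_cpus' on a legacy mount, and the per-thread 'preferred_cpuset' proc entry); the mount point, cgroup name and thread id are placeholders, and error handling is minimal:

#include <stdio.h>
#include <stdlib.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	if (fputs(val, f) == EOF || fclose(f) != 0) {
		perror(path);
		exit(1);
	}
}

int main(void)
{
	/* preferred CPUs for every task in the cpuset cgroup "mygrp" */
	write_str("/sys/fs/cgroup/cpuset/mygrp/cpuset.preferred_cpus", "0-3\n");

	/* preferred CPUs for a single thread (tid 1234 is a placeholder) */
	write_str("/proc/1234/task/1234/preferred_cpuset", "0-3\n");

	return 0;
}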
On 2023/7/25 14:57, Hui Tang wrote:
> [...]
> +	p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL);

With the CONFIG enabled, every task gets a prefer_cpus allocation, and doing
that allocation dynamically on the fork path adds overhead to fork. If the
feature is not used, can this cost be eliminated? For example:
1. Add a startup switch, so that none of this work is done when the feature
   is not enabled.
2. If prefer_cpus were made part of task_struct itself, would it be more
   efficient to manage, being allocated and freed directly along with
   task_struct?

Can this feature be disabled by default and only enabled when needed?
-----Original Message-----
From: Xiexiuqi
Sent: 2023-07-25 16:26
To: tanghui (C) tanghui20@huawei.com; kernel@openeuler.org
Cc: Chenhui (Judy) judy.chenhui@huawei.com; wangshaobo bobo.shaobowang@huawei.com
Subject: Re: [PATCH openEuler-23.09 2/7] cpuset: Introduce new interface for scheduler dynamic affinity
On 2023/7/25 14:57, Hui Tang wrote:
> [...]
> +	p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL);

With the CONFIG enabled, every task gets a prefer_cpus allocation, and doing
that allocation dynamically on the fork path adds overhead to fork. If the
feature is not used, can this cost be eliminated? For example:
1. Add a startup switch, so that none of this work is done when the feature
   is not enabled.
2. If prefer_cpus were made part of task_struct itself, would it be more
   efficient to manage, being allocated and freed directly along with
   task_struct?

Can this feature be disabled by default and only enabled when needed?
------ Patch 6 adds a cmdline switch for this. It is disabled by default, and scenarios that need the feature enable it via the cmdline.
From: tanghui tanghui20@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7NS6Y
--------------------------------
Compare the task group's 'util_avg' on the preferred CPUs with the capacity of those preferred CPUs, and dynamically adjust the CPU range used in the task wakeup path.
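For reference, a stand-alone model of the check implemented by set_task_select_cpus() in the hunk below (a sketch, not kernel code: per-CPU utilization and capacity are passed in as plain arrays, and an idle preferred CPU is approximated here by zero utilization):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static int sysctl_sched_util_low_pct = 85;	/* default added by this patch */

/* Return true if the task should stay confined to its preferred CPUs. */
static bool use_preferred_cpus(const unsigned long *util_avg,
			       const unsigned long *capacity,
			       size_t nr_preferred)
{
	unsigned long util_sum = 0, cap_sum = 0;
	size_t i;

	for (i = 0; i < nr_preferred; i++) {
		if (util_avg[i] == 0)	/* stand-in for available_idle_cpu() */
			return true;
		util_sum += util_avg[i];
		cap_sum += capacity[i];
	}

	/* Confine while the group uses at most util_low_pct% of the capacity. */
	return cap_sum > nr_preferred &&
	       util_sum * 100 <= cap_sum * sysctl_sched_util_low_pct;
}

int main(void)
{
	unsigned long util[4] = { 700, 800, 900, 1000 };	/* sum 3400 */
	unsigned long cap[4]  = { 1024, 1024, 1024, 1024 };	/* sum 4096 */

	/* 3400 * 100 <= 4096 * 85, so the task stays on the preferred CPUs */
	printf("confine: %d\n", use_preferred_cpus(util, cap, 4));
	return 0;
}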
Signed-off-by: tanghui tanghui20@huawei.com
---
 include/linux/sched.h        |   1 +
 include/linux/sched/sysctl.h |   3 +
 kernel/sched/fair.c          | 133 +++++++++++++++++++++++++++++++++++
 kernel/sysctl.c              |  11 +++
 4 files changed, 148 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index ce6208bfb530..238ac254af0d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1536,6 +1536,7 @@ struct task_struct {
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_t *prefer_cpus; + const cpumask_t *select_cpus; #endif
/* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 5a64582b086b..ede157a678f8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -29,4 +29,7 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +extern int sysctl_sched_util_low_pct; +#endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 373ff5f55884..604af885baa7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6704,7 +6704,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + for_each_cpu_and(i, sched_group_span(group), p->select_cpus) { +#else for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { +#endif struct rq *rq = cpu_rq(i);
if (!sched_core_cookie_match(rq, p)) @@ -6751,7 +6755,11 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus)) +#else if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) +#endif return prev_cpu;
/* @@ -6875,7 +6883,11 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { +#endif *idle_cpu = cpu; break; } @@ -6901,7 +6913,11 @@ static int select_idle_smt(struct task_struct *p, int target) { int cpu;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + for_each_cpu_and(cpu, cpu_smt_mask(target), p->select_cpus) { +#else for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { +#endif if (cpu == target) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) @@ -6949,7 +6965,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct sched_domain *this_sd = NULL; u64 time = 0;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); +#else cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +#endif
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; @@ -7122,6 +7142,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(target, p->select_cpus) && +#endif asym_fits_cpu(task_util, util_min, util_max, target)) return target;
@@ -7130,6 +7153,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(prev, p->select_cpus) && +#endif asym_fits_cpu(task_util, util_min, util_max, prev)) return prev;
@@ -7156,7 +7182,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) && +#else cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && +#endif asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { return recent_used_cpu; } @@ -7638,6 +7668,82 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) return target; }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +/* + * Low utilization threshold for CPU + * + * (default: 85%), units: percentage of CPU utilization) + */ +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + return p->prefer_cpus && + !cpumask_empty(p->prefer_cpus) && + !cpumask_equal(p->prefer_cpus, p->cpus_ptr) && + cpumask_subset(p->prefer_cpus, p->cpus_ptr); +} + +/* + * set_task_select_cpus: select the cpu range for task + * @p: the task whose available cpu range will to set + * @idlest_cpu: the cpu which is the idlest in prefer cpus + * + * If sum of 'util_avg' among 'preferred_cpus' lower than the percentage + * 'sysctl_sched_util_low_pct' of 'preferred_cpus' capacity, select + * 'preferred_cpus' range for task, otherwise select 'preferred_cpus' for task. + * + * The available cpu range set to p->select_cpus. Idlest cpu in preferred cpus + * set to @idlest_cpu, which is set to wakeup cpu when fast path wakeup cpu + * without p->select_cpus. + */ +static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, + int sd_flag) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + + p->select_cpus = p->cpus_ptr; + if (!prefer_cpus_valid(p)) + return; + + rcu_read_lock(); + tg = task_group(p); + for_each_cpu(cpu, p->prefer_cpus) { + if (unlikely(!tg->se[cpu])) + continue; + + if (idlest_cpu && available_idle_cpu(cpu)) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(capacity_of(cpu) - tg->se[cpu]->avg.util_avg); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (available_idle_cpu(cpu)) { + rcu_read_unlock(); + p->select_cpus = p->prefer_cpus; + return; + } + + util_avg_sum += tg->se[cpu]->avg.util_avg; + tg_capacity += capacity_of(cpu); + } + rcu_read_unlock(); + + if (tg_capacity > cpumask_weight(p->prefer_cpus) && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) + p->select_cpus = p->prefer_cpus; +} +#endif + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, @@ -7658,11 +7764,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) int want_affine = 0; /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + int idlest_cpu = 0; +#endif
/* * required for stable ->cpus_allowed */ lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, &idlest_cpu, sd_flag); +#endif + if (wake_flags & WF_TTWU) { record_wakee(p);
@@ -7673,7 +7787,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = prev_cpu; }
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->select_cpus); +#else want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); +#endif }
rcu_read_lock(); @@ -7684,7 +7802,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + new_cpu = cpu; + if (cpu != prev_cpu && + cpumask_test_cpu(prev_cpu, p->select_cpus)) +#else if (cpu != prev_cpu) +#endif new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
sd = NULL; /* Prefer wake_affine over balance flags */ @@ -7711,6 +7835,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) } rcu_read_unlock();
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_test_cpu(new_cpu, p->select_cpus)) + new_cpu = idlest_cpu; +#endif return new_cpu; }
@@ -9860,8 +9988,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group;
/* Skip over this group if it has no CPUs allowed */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (!cpumask_intersects(sched_group_span(group), + p->select_cpus)) +#else if (!cpumask_intersects(sched_group_span(group), p->cpus_ptr)) +#endif continue;
/* Skip over this group if no cookie matched */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bfe53e835524..acc20b417dc8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2054,6 +2054,17 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, +#endif +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .procname = "sched_util_low_pct", + .data = &sysctl_sched_util_low_pct, + .maxlen = sizeof(sysctl_sched_util_low_pct), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, #endif { } };
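The selection logic above reduces to one ratio check: a task is confined to its preferred cpus whenever one of them is idle, or the task group's summed 'util_avg' on those cpus stays at or below sysctl_sched_util_low_pct percent of their combined capacity; otherwise p->select_cpus falls back to the full p->cpus_ptr. The sysctl is added to kern_table, so it should be reachable as kernel.sched_util_low_pct (/proc/sys/kernel/sched_util_low_pct). The snippet below is a stand-alone userspace sketch of that inequality with hypothetical numbers (4 preferred cpus, SCHED_CAPACITY_SCALE assumed to be 1024); it is an illustration, not kernel code.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL    /* assumed per-CPU capacity unit */

int main(void)
{
    /* Hypothetical numbers: 4 preferred CPUs at full capacity, and a
     * task group whose cfs utilization on those CPUs sums to 3000. */
    unsigned long tg_capacity = 4 * SCHED_CAPACITY_SCALE;    /* 4096 */
    unsigned long util_avg_sum = 3000;
    unsigned long low_pct = 85;        /* sysctl_sched_util_low_pct */

    /* Same inequality as set_task_select_cpus():
     * 3000 * 100 = 300000 <= 4096 * 85 = 348160 -> low utilization. */
    if (util_avg_sum * 100 <= tg_capacity * low_pct)
        printf("below %lu%%: p->select_cpus = p->prefer_cpus\n", low_pct);
    else
        printf("bursting: p->select_cpus stays p->cpus_ptr\n");

    return 0;
}

With these numbers the group would have to accumulate more than about 3481 units of util_avg on the preferred cpus before the task is allowed to burst; the additional 'tg_capacity > cpumask_weight(p->prefer_cpus)' guard in the kernel code keeps the comparison from firing on a degenerate capacity sum.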
From: tanghui tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7NS6Y
--------------------------------
Do not allow a task to be migrated out of its preferred cpus during load balancing.
Signed-off-by: tanghui tanghui20@huawei.com --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 604af885baa7..deb0bb55b3ae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8655,7 +8655,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0;
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + set_task_select_cpus(p, NULL, 0); + if (!cpumask_test_cpu(env->dst_cpu, p->select_cpus)) { +#else if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { +#endif int cpu;
schedstat_inc(p->stats.nr_failed_migrations_affine); @@ -8678,7 +8683,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + if (cpumask_test_cpu(cpu, p->select_cpus)) { +#else if (cpumask_test_cpu(cpu, p->cpus_ptr)) { +#endif env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break;
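Load balancing consults the same narrowed mask: can_migrate_task() now calls set_task_select_cpus() with a NULL idlest_cpu and no wake flag, then tests env->dst_cpu against p->select_cpus instead of p->cpus_ptr, so a task from a low-utilization group is not pulled onto a cpu outside its preferred set and the attempt is accounted as an affine failure. Below is a minimal userspace sketch of that gate, using plain bitmasks as stand-ins for struct cpumask and hypothetical cpu numbers.

#include <stdio.h>

int main(void)
{
    /* Hypothetical masks: CPUs 0-3 preferred, CPUs 0-7 allowed. */
    unsigned long prefer_cpus = 0x0fUL;
    unsigned long cpus_ptr = 0xffUL;
    int group_below_threshold = 1;    /* assume utilization is low */
    int dst_cpu = 5;

    /* set_task_select_cpus() narrows the effective mask first ... */
    unsigned long select_cpus = group_below_threshold ? prefer_cpus : cpus_ptr;

    /* ... then can_migrate_task() treats it like the affinity mask. */
    if (!(select_cpus & (1UL << dst_cpu)))
        printf("dst_cpu %d rejected (nr_failed_migrations_affine)\n", dst_cpu);
    else
        printf("dst_cpu %d is a valid migration target\n", dst_cpu);

    return 0;
}

The LBF_DST_PINNED retry path is narrowed the same way, so the balancer only re-selects destination cpus that are still inside p->select_cpus.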
From: tanghui tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7NS6Y
--------------------------------
Signed-off-by: tanghui tanghui20@huawei.com --- include/linux/sched.h | 5 +++++ kernel/sched/debug.c | 4 ++++ kernel/sched/fair.c | 11 +++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 238ac254af0d..eabe48b9f2d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -543,6 +543,11 @@ struct sched_statistics { #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + u64 nr_wakeups_preferred_cpus; + u64 nr_wakeups_force_preferred_cpus; +#endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0b2340a79b65..7339abce91f4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1042,6 +1042,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_wakeups_affine_attempts); P_SCHEDSTAT(nr_wakeups_passive); P_SCHEDSTAT(nr_wakeups_idle); +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + P_SCHEDSTAT(nr_wakeups_preferred_cpus); + P_SCHEDSTAT(nr_wakeups_force_preferred_cpus); +#endif
avg_atom = p->se.sum_exec_runtime; if (nr_switches) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index deb0bb55b3ae..c23f20eea32a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7730,6 +7730,8 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, if (available_idle_cpu(cpu)) { rcu_read_unlock(); p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->stats.nr_wakeups_preferred_cpus); return; }
@@ -7739,8 +7741,11 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, rcu_read_unlock();
if (tg_capacity > cpumask_weight(p->prefer_cpus) && - util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { p->select_cpus = p->prefer_cpus; + if (sd_flag & SD_BALANCE_WAKE) + schedstat_inc(p->stats.nr_wakeups_preferred_cpus); + } } #endif
@@ -7836,8 +7841,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) rcu_read_unlock();
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - if (!cpumask_test_cpu(new_cpu, p->select_cpus)) + if (!cpumask_test_cpu(new_cpu, p->select_cpus)) { new_cpu = idlest_cpu; + schedstat_inc(p->stats.nr_wakeups_force_preferred_cpus); + } #endif return new_cpu; }
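The two counters make this behaviour observable per task: nr_wakeups_preferred_cpus increments on wakeups where set_task_select_cpus() confined the task to its preferred cpus, and nr_wakeups_force_preferred_cpus increments when the chosen wakeup cpu fell outside p->select_cpus and was overridden with the cached idlest preferred cpu. With CONFIG_SCHEDSTATS and schedstats enabled, they should show up among the per-task statistics in /proc/<pid>/sched; a hypothetical excerpt (values invented, exact alignment comes from the debug print helpers):

nr_wakeups_preferred_cpus                    :                   42
nr_wakeups_force_preferred_cpus              :                    3

A large force_preferred count relative to the preferred count suggests the fast path frequently picks cpus outside the preferred set, i.e. the threshold or the preferred mask may be too tight for the workload.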
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7NS6Y
--------------------------------
Add the cmdline parameter 'dynamic_affinity' to control the dynamic affinity feature, which is disabled by default.
Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched.h | 6 ++++++ kernel/cgroup/cpuset.c | 3 +++ kernel/fork.c | 11 +++++++---- kernel/sched/core.c | 3 +++ kernel/sched/debug.c | 6 ++++-- kernel/sched/fair.c | 13 +++++++++++++ 6 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index eabe48b9f2d0..873a9b43e63e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2459,5 +2459,11 @@ int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask); int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask); void sched_prefer_cpus_free(struct task_struct *p); + +extern struct static_key_false __dynamic_affinity_switch; +static inline bool dynamic_affinity_enabled(void) +{ + return static_branch_unlikely(&__dynamic_affinity_switch); +} #endif #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0862dbeca1c8..cbc9c3c06efa 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -858,6 +858,9 @@ static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs == &top_cpuset) return -EACCES;
+ if (!dynamic_affinity_enabled()) + return -EPERM; + /* * An empty prefer_cpus is ok which mean that the cpuset tasks disable * dynamic affinity feature. diff --git a/kernel/fork.c b/kernel/fork.c index b6ba64f94822..a721784458d9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -629,7 +629,8 @@ void free_task(struct task_struct *tsk) free_kthread_struct(tsk); bpf_task_storage_free(tsk); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - sched_prefer_cpus_free(tsk); + if (dynamic_affinity_enabled()) + sched_prefer_cpus_free(tsk); #endif free_task_struct(tsk); } @@ -2368,9 +2369,11 @@ __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - retval = sched_prefer_cpus_fork(p, current->prefer_cpus); - if (retval) - goto bad_fork_free; + if (dynamic_affinity_enabled()) { + retval = sched_prefer_cpus_fork(p, current->prefer_cpus); + if (retval) + goto bad_fork_free; + } #endif
lockdep_assert_irqs_enabled(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 836a3b93ca6d..b0d860fd35af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11395,6 +11395,9 @@ static int __set_prefer_cpus_ptr(struct task_struct *p, struct rq *rq; int ret = 0;
+ if (!dynamic_affinity_enabled()) + return -EPERM; + if (unlikely(!p->prefer_cpus)) return -EINVAL;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7339abce91f4..940e191d7722 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1043,8 +1043,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_wakeups_passive); P_SCHEDSTAT(nr_wakeups_idle); #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY - P_SCHEDSTAT(nr_wakeups_preferred_cpus); - P_SCHEDSTAT(nr_wakeups_force_preferred_cpus); + if (dynamic_affinity_enabled()) { + P_SCHEDSTAT(nr_wakeups_preferred_cpus); + P_SCHEDSTAT(nr_wakeups_force_preferred_cpus); + } #endif
avg_atom = p->se.sum_exec_runtime; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c23f20eea32a..8ee3a435144e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7669,6 +7669,16 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) }
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + +DEFINE_STATIC_KEY_FALSE(__dynamic_affinity_switch); + +static int __init dynamic_affinity_switch_setup(char *__unused) +{ + static_branch_enable(&__dynamic_affinity_switch); + return 1; +} +__setup("dynamic_affinity", dynamic_affinity_switch_setup); + /* * Low utilization threshold for CPU * @@ -7678,6 +7688,9 @@ int sysctl_sched_util_low_pct = 85;
static inline bool prefer_cpus_valid(struct task_struct *p) { + if (!dynamic_affinity_enabled()) + return false; + return p->prefer_cpus && !cpumask_empty(p->prefer_cpus) && !cpumask_equal(p->prefer_cpus, p->cpus_ptr) &&
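The boot-time switch follows the usual default-off static key pattern: __dynamic_affinity_switch starts disabled, the __setup() handler flips it when 'dynamic_affinity' appears on the kernel command line, and every consumer (prefer_cpus_valid(), __set_prefer_cpus_ptr(), update_prefer_cpumask(), the fork/free paths) bails out early while it stays off, so writes to the prefer_cpus interfaces fail with -EPERM and no per-task cpumask is allocated. A minimal userspace analogue of that pattern, with illustrative names rather than real kernel APIs:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool dynamic_affinity_on;    /* default off, like the static key */

/* Counterpart of the __setup("dynamic_affinity", ...) handler. */
static void parse_cmdline(const char *cmdline)
{
    if (strstr(cmdline, "dynamic_affinity"))
        dynamic_affinity_on = true;
}

int main(void)
{
    parse_cmdline("root=/dev/vda1 quiet dynamic_affinity");

    /* Consumers check the flag early, as prefer_cpus_valid() and
     * __set_prefer_cpus_ptr() now do in the kernel. */
    printf("prefer_cpus interfaces: %s\n",
           dynamic_affinity_on ? "active" : "rejected with -EPERM");

    return 0;
}

In the kernel the check is static_branch_unlikely(&__dynamic_affinity_switch), so the disabled case costs only a patched-out branch on the scheduler hot paths.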
From: tanghui tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7NS6Y
--------------------------------
Signed-off-by: tanghui tanghui20@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 63abdb3f8c63..72ee3934408a 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -164,6 +164,7 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 6bc7626016d9..bc349a9f4fad 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -187,6 +187,7 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_SCHED_MM_CID=y +CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y
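Taken together, a kernel built from these defconfigs compiles the feature in via CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y but leaves it inert until 'dynamic_affinity' is added to the kernel command line; after that, preferred cpus can be assigned through the per-task preferred_cpuset proc file or the cpuset-level interface added earlier in the series, and the burst threshold tuned through the sched_util_low_pct sysctl (default 85). Exact interface paths depend on those earlier patches and on how the cpuset controller is mounted.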
Feedback: The patch(es) you sent to kernel@openeuler.org could not be converted to a PR! Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/Z... Failure Reason: the cover letter of the patch series is missing. Suggested Solution: please add the cover letter to the patch series and resend it to the mailing list.