From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add three hooks of sched type in select_task_rq_fair(), as follows:

'cfs_select_rq'
    Replace the original core-selection policy or implement dynamic
    CPU affinity.

'cfs_select_rq_exit'
    Restore the task's CPU affinity before select_task_rq_fair()
    returns. To be used together with the 'cfs_select_rq' hook to
    implement dynamic CPU affinity.

'cfs_wake_affine'
    Determine on which CPU the task can run soonest, and allow the
    user to implement different policies.
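As an illustration (not part of this patch), a minimal BPF-side sketch of how
the three hooks might be used. The "sched/..." section names, the program
layout and the direct field access to the context structs are assumptions
about the accompanying bpf-sched infrastructure; only the context struct
layouts and the return-value semantics come from this change:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  /* Keep the kernel's default core-selection path (-1 == fall back;
   * a CPU number >= 0 would be used as the target CPU directly).
   */
  SEC("sched/cfs_select_rq")
  int sched_cfs_select_rq(struct sched_migrate_ctx *c)
  {
  	return -1;
  }

  /* Prefer the waker's CPU on synchronous wakeups, otherwise stay on
   * the CPU the task last ran on; a value outside [0, nr_cpumask_bits)
   * falls back to the WA_IDLE/WA_WEIGHT heuristics.
   */
  SEC("sched/cfs_wake_affine")
  int sched_cfs_wake_affine(struct sched_affine_ctx *c)
  {
  	return c->is_sync ? c->curr_cpu : c->prev_cpu;
  }

  /* Called just before select_task_rq_fair() returns; a CPU >= 0
   * overrides the kernel's choice, a negative value keeps it.
   */
  SEC("sched/cfs_select_rq_exit")
  int sched_cfs_select_rq_exit(struct sched_migrate_ctx *c)
  {
  	return -1;
  }

  char LICENSE[] SEC("license") = "GPL";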
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Signed-off-by: Guan Jing <guanjing6@huawei.com>
---
 include/linux/sched.h           | 18 +++++++++
 include/linux/sched_hook_defs.h |  3 ++
 kernel/sched/core.c             | 11 ++++++
 kernel/sched/fair.c             | 66 +++++++++++++++++++++++++++++++++
 scripts/bpf_doc.py              |  4 ++
 5 files changed, 102 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 492f2955a602..ae7cb5bb40e9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2530,6 +2530,24 @@ enum cpumask_op_type {
 	CPUMASK_CPULIST_PARSE
 };
 
+struct sched_migrate_ctx {
+	struct task_struct *task;
+	struct cpumask *select_idle_mask;
+	int prev_cpu;
+	int curr_cpu;
+	int is_sync;
+	int want_affine;
+	int wake_flags;
+	int sd_flag;
+	int new_cpu;
+};
+
+struct sched_affine_ctx {
+	struct task_struct *task;
+	int prev_cpu;
+	int curr_cpu;
+	int is_sync;
+};
 #endif
 
 #endif
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
index 9350f2b7148e..f8d1381a1a9a 100644
--- a/include/linux/sched_hook_defs.h
+++ b/include/linux/sched_hook_defs.h
@@ -7,3 +7,6 @@ BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, const struct sched_entity *curr
 	       const struct sched_entity *next)
 BPF_SCHED_HOOK(void, (void) 0, cfs_enqueue_task, struct rq *rq, struct task_struct *p)
 BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_struct *p)
+BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx)
+BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx)
+BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2883b81bc3f2..53f1e765412e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9875,6 +9875,10 @@ LIST_HEAD(task_groups);
 static struct kmem_cache *task_group_cache __read_mostly;
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+#endif
+
 void __init sched_init(void)
 {
 	unsigned long ptr = 0;
@@ -9919,6 +9923,13 @@ void __init sched_init(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 	}
 
+#if defined(CONFIG_CPUMASK_OFFSTACK) && defined(CONFIG_BPF_SCHED)
+	for_each_possible_cpu(i) {
+		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+	}
+#endif
+
 	init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03b2996cfb1b..c61ea896fcae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -72,6 +72,10 @@ unsigned int sysctl_sched_latency = 6000000ULL;
 static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 
+#ifdef CONFIG_BPF_SCHED
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+#endif
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -6703,6 +6707,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 {
 	int target = nr_cpumask_bits;
 
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		struct sched_affine_ctx ctx;
+		int ret;
+
+		ctx.task = p;
+		ctx.prev_cpu = prev_cpu;
+		ctx.curr_cpu = this_cpu;
+		ctx.is_sync = sync;
+
+		ret = bpf_sched_cfs_wake_affine(&ctx);
+		if (ret >= 0 && ret < nr_cpumask_bits)
+			return ret;
+	}
+#endif
+
 	if (sched_feat(WA_IDLE))
 		target = wake_affine_idle(this_cpu, prev_cpu, sync);
 
@@ -7821,6 +7841,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	int idlest_cpu = 0;
 #endif
 
+#ifdef CONFIG_BPF_SCHED
+	struct sched_migrate_ctx ctx;
+	cpumask_t *cpus_prev = NULL;
+	cpumask_t *cpus;
+	int ret;
+#endif
+
 	/*
 	 * required for stable ->cpus_allowed
 	 */
@@ -7848,6 +7875,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	}
 
 	rcu_read_lock();
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		ctx.task = p;
+		ctx.prev_cpu = prev_cpu;
+		ctx.curr_cpu = cpu;
+		ctx.is_sync = sync;
+		ctx.wake_flags = wake_flags;
+		ctx.want_affine = want_affine;
+		ctx.sd_flag = sd_flag;
+		ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask);
+
+		ret = bpf_sched_cfs_select_rq(&ctx);
+		if (ret >= 0) {
+			rcu_read_unlock();
+			return ret;
+		} else if (ret != -1) {
+			cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+			if (cpumask_subset(cpus, p->cpus_ptr) &&
+			    !cpumask_empty(cpus)) {
+				cpus_prev = (void *)p->cpus_ptr;
+				p->cpus_ptr = cpus;
+			}
+		}
+	}
+#endif
+
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If both 'cpu' and 'prev_cpu' are part of this domain,
@@ -7886,6 +7939,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		/* Fast path */
 		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 	}
+
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		ctx.new_cpu = new_cpu;
+		ret = bpf_sched_cfs_select_rq_exit(&ctx);
+		if (ret >= 0)
+			new_cpu = ret;
+
+		if (cpus_prev)
+			p->cpus_ptr = cpus_prev;
+	}
+#endif
+
 	rcu_read_unlock();
 
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 7f5952a3be61..d17415b53a88 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -707,6 +707,8 @@ class PrinterHelpers(Printer):
             'struct sched_entity',
             'struct cpumask',
             'struct cpumask_op_args',
+            'struct sched_migrate_ctx',
+            'struct sched_affine_ctx',
     ]
     known_types = {
             '...',
@@ -769,6 +771,8 @@ class PrinterHelpers(Printer):
             'struct sched_entity',
             'struct cpumask',
             'struct cpumask_op_args',
+            'struct sched_migrate_ctx',
+            'struct sched_affine_ctx',
     }
     mapped_types = {
             'u8': '__u8',
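A note on the cfs_select_rq return-value contract implemented above: a value
>= 0 is used as the target CPU directly, -1 falls back to the default
core-selection path, and any other negative value makes select_task_rq_fair()
run its normal search restricted to ctx->select_idle_mask, provided the mask
is a non-empty subset of p->cpus_ptr; the temporary p->cpus_ptr swap is undone
in the exit block. A hedged sketch of that usage (program names are
illustrative, section names again assumed; populating select_idle_mask needs
a cpumask helper from the surrounding patch set and is elided here):

  /* Ask the kernel to search only within select_idle_mask. */
  SEC("sched/cfs_select_rq")
  int restrict_to_idle_mask(struct sched_migrate_ctx *c)
  {
  	/* ... fill c->select_idle_mask with the preferred CPUs ... */
  	return -2;	/* not -1 and not a CPU: restrict the search */
  }

  /* Runs after the restricted search; returning a CPU >= 0 would
   * override c->new_cpu, -1 keeps the kernel's choice.  The saved
   * p->cpus_ptr is restored by the kernel in the same exit block.
   */
  SEC("sched/cfs_select_rq_exit")
  int after_restricted_search(struct sched_migrate_ctx *c)
  {
  	return -1;
  }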