hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Implement a mechanism to periodically adjust the preferred NUMA nodes of a relationship group, and allow different adjustment policies to be customized in a BPF program.
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched/relationship.h | 24 +++++++++++
 include/linux/sched_hook_defs.h    |  2 +
 kernel/sched/fair.c                | 67 +++++++++++++++++++++++++++++-
 kernel/sched/relationship.c        |  4 ++
 4 files changed, 95 insertions(+), 2 deletions(-)
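For illustration only (not part of this patch): a policy consuming the new hook could look roughly like the sketch below. The SEC() name and the BPF_PROG() wrapper are assumed to follow the pattern of the existing bpf-sched hooks, and a real policy would rewrite ctx->preferred_node from the group's NUMA fault statistics rather than only inspecting the context.

/*
 * Hypothetical sketch, not part of this patch.  The section name and the
 * BPF_PROG() wrapper are assumptions based on the existing bpf-sched hooks.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_change_preferred_node")
void BPF_PROG(change_preferred_node, struct sched_preferred_node_ctx *ctx)
{
	/*
	 * Invoked from task_preferred_node_work() roughly every 100ms per
	 * numa_group.  A real policy would examine the group's fault data
	 * and update ctx->preferred_node accordingly.
	 */
	if (!ctx)
		return;
}

char LICENSE[] SEC("license") = "GPL";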
diff --git a/include/linux/sched/relationship.h b/include/linux/sched/relationship.h
index 45861f66ac4e..fbc5c2bab5dc 100644
--- a/include/linux/sched/relationship.h
+++ b/include/linux/sched/relationship.h
@@ -113,7 +113,25 @@ struct task_relationship {
 	/* extras numa fault data */
 	struct numa_fault_ext faults;
+
+#ifdef CONFIG_NUMA_BALANCING
+	/* preferred nodes adjust */
+	u64 node_stamp;
+	struct callback_head node_work;
+#endif
+};
+
+#ifdef CONFIG_BPF_SCHED
+struct sched_preferred_node_ctx {
+	struct task_struct *tsk;
+	nodemask_t preferred_node;
+
+	KABI_RESERVE(1)
+	KABI_RESERVE(2)
+	KABI_RESERVE(3)
+	KABI_RESERVE(4)
 };
+#endif
 
 extern void task_relationship_enable(void);
 extern void task_relationship_disable(void);
@@ -140,6 +158,9 @@ extern void sched_get_relationship(struct task_struct *tsk,
 				   struct bpf_relationship_get_args *args);
 extern void numa_faults_update_and_sort(int nid, int new,
 				   struct fault_array_info *stats);
+extern void task_tick_relationship(struct rq *rq, struct task_struct *curr);
+
+extern void task_preferred_node_work(struct callback_head *work);
 
 DECLARE_STATIC_KEY_FALSE(__relationship_switch);
 static inline bool task_relationship_used(void)
@@ -167,6 +188,9 @@ sched_net_relationship_submit(struct net_relationship_req *req)
 {
 	return 0;
 }
+
+static inline void
+task_tick_relationship(struct rq *rq, struct task_struct *curr) {}
 #endif
 
 #endif
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
index e2519a00aa6b..0a871f728c85 100644
--- a/include/linux/sched_hook_defs.h
+++ b/include/linux/sched_hook_defs.h
@@ -12,3 +12,5 @@ BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx)
 BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx)
 BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p,
 	       struct sched_migrate_node *migrate_node)
+BPF_SCHED_HOOK(void, (void) 0, cfs_change_preferred_node,
+	       struct sched_preferred_node_ctx *ctx)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 439eb7f9791d..3322c4ececd0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1090,6 +1090,8 @@ struct numa_group {
 	struct fault_array_info score_ordered[FAULT_NODES_MAX];
 	struct fault_array_info faults_ordered[FAULT_NODES_MAX];
 	nodemask_t preferred_nid;
+	u64 node_stamp;
+	u64 nodes_switch_cnt;
 #endif
 	/*
 	 * Faults_cpu is used to decide whether memory should move
@@ -2547,6 +2549,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 #ifdef CONFIG_SCHED_TASK_RELATIONSHIP
 	if (task_relationship_used()) {
 		grp->preferred_nid = NODE_MASK_NONE;
+		grp->node_stamp = jiffies;
 		for (i = 0; i < FAULT_NODES_MAX; i++) {
 			grp->faults_ordered[i].nid = -1;
 			grp->score_ordered[i].nid = -1;
@@ -13271,6 +13274,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		update_overutilized_status(task_rq(curr));
 
 	task_tick_core(rq, curr);
+
+	task_tick_relationship(rq, curr);
 }
 
 /*
@@ -13861,8 +13866,9 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m)
 
 	ng = rcu_dereference(p->numa_group);
 	if (ng) {
-		seq_printf(m, "numa group preferred nid %*pbl\n",
-			   nodemask_pr_args(&ng->preferred_nid));
+		seq_printf(m, "numa group preferred nid %*pbl switch_cnt %llu\n",
+			   nodemask_pr_args(&ng->preferred_nid),
+			   ng->nodes_switch_cnt);
 	}
 
 	net_grp = rcu_dereference(p->rship->net_group);
@@ -13877,6 +13883,63 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m)
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+void task_preferred_node_work(struct callback_head *work)
+{
+#ifdef CONFIG_NUMA_BALANCING
+	struct task_struct *curr = current;
+	struct numa_group *numa_grp;
+#ifdef CONFIG_BPF_SCHED
+	struct sched_preferred_node_ctx ctx = {0};
+#endif
+
+	work->next = work;
+
+#ifdef CONFIG_BPF_SCHED
+	numa_grp = deref_curr_numa_group(curr);
+	if (numa_grp) {
+
+		spin_lock_irq(&numa_grp->lock);
+		ctx.tsk = curr;
+		ctx.preferred_node = numa_grp->preferred_nid;
+		bpf_sched_cfs_change_preferred_node(&ctx);
+		spin_unlock_irq(&numa_grp->lock);
+	}
+#endif
+#endif
+}
+
+void task_tick_relationship(struct rq *rq, struct task_struct *curr)
+{
+#ifdef CONFIG_NUMA_BALANCING
+	struct callback_head *work = &curr->rship->node_work;
+	struct numa_group *numa_grp;
+
+	if (!task_relationship_supported(curr))
+		return;
+
+	if (work->next != work)
+		return;
+
+	numa_grp = deref_curr_numa_group(curr);
+	if (!numa_grp || numa_grp->nr_tasks <= 1)
+		return;
+
+	spin_lock(&numa_grp->lock);
+
+	if (time_after(jiffies,
+	    (unsigned long)(numa_grp->node_stamp + msecs_to_jiffies(100)))) {
+		numa_grp->node_stamp = jiffies;
+		spin_unlock(&numa_grp->lock);
+		task_work_add(curr, &curr->rship->node_work, TWA_RESUME);
+		return;
+	}
+
+	spin_unlock(&numa_grp->lock);
+#endif
+}
+#endif
+
 __init void init_sched_fair_class(void)
 {
 #ifdef CONFIG_QOS_SCHED
diff --git a/kernel/sched/relationship.c b/kernel/sched/relationship.c
index a85f85794f6e..515c913aeb33 100644
--- a/kernel/sched/relationship.c
+++ b/kernel/sched/relationship.c
@@ -422,6 +422,10 @@ int sched_relationship_fork(struct task_struct *p)
 
 	spin_lock_init(&p->rship->net_lock);
 	init_task_work(&p->rship->cb.twork, task_net_relationship_work);
+#ifdef CONFIG_NUMA_BALANCING
+	p->rship->node_work.next = &p->rship->node_work;
+	init_task_work(&p->rship->node_work, task_preferred_node_work);
+#endif
 
 	return 0;
 }