Introduce NUMA isolation and consolidation.
Guan Jing (1):
  sched: Add can_migrate_task hook
Hui Tang (10):
  sched: Introduce CONFIG_TASK_PLACEMENT_BY_CPU_RANGE
  sched: Some fixes for select_rq hook
  bpf:programmable: Add nodemask operation collection
  sched: Introduce task relationship by net and memory
  bpf:programmable: Add helper to get memory and net relationship
  sched: Add ioctl to get relationship
  sched: Update numa group preferred node periodically
  bpf:programmable: Add helper to set preferred node
  sched: Introduce CONFIG_QOS_SCHED_NUMA_ICON
  config: Enable NUMA isolation and consolidation by default
 arch/arm64/configs/openeuler_defconfig |   4 +-
 arch/x86/configs/openeuler_defconfig   |   2 +
 fs/exec.c                              |   2 +
 include/linux/sched.h                  |  58 +++-
 include/linux/sched/relationship.h     | 202 ++++++++++++
 include/linux/sched_hook_defs.h        |   4 +
 include/uapi/linux/bpf.h               |  50 +++
 include/uapi/linux/sched_ctrl.h        |  57 ++++
 init/Kconfig                           |  28 ++
 init/init_task.c                       |   3 +
 kernel/fork.c                          |  13 +
 kernel/sched/Makefile                  |   2 +
 kernel/sched/bpf_sched.c               | 149 +++++++++
 kernel/sched/core.c                    |   7 +
 kernel/sched/debug.c                   |   2 +
 kernel/sched/fair.c                    | 302 +++++++++++++++--
 kernel/sched/numa_icon.c               | 144 ++++++++
 kernel/sched/numa_icon.h               |  43 +++
 kernel/sched/relationship.c            | 436 +++++++++++++++++++++++++
 kernel/sched/relationship_ioctl.c      | 142 ++++++++
 kernel/sched/sched.h                   |   2 +
 scripts/bpf_helpers_doc.py             |  12 +
 tools/include/uapi/linux/bpf.h         |  50 +++
 tools/lib/bpf/libbpf_sched.h           | 160 ++++++++-
 24 files changed, 1848 insertions(+), 26 deletions(-)
 create mode 100644 include/linux/sched/relationship.h
 create mode 100644 include/uapi/linux/sched_ctrl.h
 create mode 100644 kernel/sched/numa_icon.c
 create mode 100644 kernel/sched/numa_icon.h
 create mode 100644 kernel/sched/relationship.c
 create mode 100644 kernel/sched/relationship_ioctl.c
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Support a custom CPU range for 'select_task_rq_fair' and 'can_migrate_task' without modifying 'cpus_ptr' (a usage sketch follows the diff below).
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h | 6 +++++-
 init/Kconfig          | 5 +++++
 kernel/sched/fair.c   | 20 ++++++++++----------
 3 files changed, 20 insertions(+), 11 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index b4ab407cab37..0981c127f261 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1437,11 +1437,15 @@ struct task_struct { KABI_USE(7, void *pf_io_worker) #if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) && !defined(__GENKSYMS__) KABI_USE(8, cpumask_t *prefer_cpus) - KABI_USE(9, const cpumask_t *select_cpus) #else KABI_RESERVE(8) +#endif +#if defined(CONFIG_TASK_PLACEMENT_BY_CPU_RANGE) && !defined(__GENKSYMS__) + KABI_USE(9, const cpumask_t *select_cpus) +#else KABI_RESERVE(9) #endif + #if (defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)) && defined(CONFIG_X86) KABI_USE(10, unsigned int sequential_io) KABI_USE(11, unsigned int sequential_io_avg) diff --git a/init/Kconfig b/init/Kconfig index d6422dc138d8..325c02d4a4df 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1066,10 +1066,14 @@ config RT_GROUP_SCHED
endif #CGROUP_SCHED
+config TASK_PLACEMENT_BY_CPU_RANGE + bool "variable cpu range for task placement" + config QOS_SCHED_DYNAMIC_AFFINITY bool "qos dynamic affinity" depends on CPUSETS depends on FAIR_CGROUP_SCHED + select TASK_PLACEMENT_BY_CPU_RANGE default n help This feature lets you allocate preferred cpus to taskgroup. If enabled, @@ -1839,6 +1843,7 @@ config BPF_SCHED bool "SCHED Instrumentation with BPF" depends on BPF_EVENTS depends on BPF_SYSCALL + select TASK_PLACEMENT_BY_CPU_RANGE help Enables instrumentation of the sched hooks with eBPF programs for implementing dynamic scheduling policies. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 31db0dc456b1..55b607564bd9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6838,7 +6838,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */ -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE for_each_cpu_and(i, sched_group_span(group), p->select_cpus) { #else for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { @@ -6889,7 +6889,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu;
-#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE if (!cpumask_intersects(sched_domain_span(sd), p->select_cpus)) #else if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) @@ -7020,7 +7020,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->select_cpus)) { #else if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { @@ -7080,7 +7080,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1;
-#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE cpumask_and(cpus, sched_domain_span(sd), p->select_cpus); #else cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); @@ -7248,7 +7248,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) && -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE cpumask_test_cpu(target, p->select_cpus) && #endif asym_fits_capacity(task_util, target)) { @@ -7261,7 +7261,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE cpumask_test_cpu(prev, p->select_cpus) && #endif asym_fits_capacity(task_util, prev)) { @@ -7297,7 +7297,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE cpumask_test_cpu(p->recent_used_cpu, p->select_cpus) && #else cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && @@ -7928,7 +7928,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = prev_cpu; }
-#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->select_cpus); #else want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); @@ -7969,7 +7969,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE new_cpu = cpu; if (cpu != prev_cpu && cpumask_test_cpu(prev_cpu, p->select_cpus)) @@ -10845,7 +10845,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group;
/* Skip over this group if it has no CPUs allowed */ -#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE if (!cpumask_intersects(sched_group_span(group), p->select_cpus)) #else
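Illustrative only, not part of this patch: with CONFIG_TASK_PLACEMENT_BY_CPU_RANGE, a placement feature is expected to point p->select_cpus at a narrower mask before the fair-class placement code runs, while p->cpus_ptr stays untouched. The helper name below is hypothetical.

static void narrow_task_placement(struct task_struct *p,
				  const struct cpumask *range)
{
	/* Default: placement may use the full affinity mask. */
	p->select_cpus = p->cpus_ptr;

	/* Restrict placement only; p->cpus_ptr is never modified. */
	if (cpumask_intersects(range, p->cpus_ptr))
		p->select_cpus = range;
}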
From: Guan Jing <guanjing6@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Add a bpf hook for 'can_migrate_task', which allows the user to decide early whether a task can be migrated during load_balance, or to fall through to the default checks (a usage sketch follows the diff below).
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h           | 10 ++++++++++
 include/linux/sched_hook_defs.h |  2 ++
 kernel/sched/fair.c             | 14 ++++++++++++++
 scripts/bpf_helpers_doc.py      |  2 ++
 4 files changed, 28 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 0981c127f261..7ef4efd8cddb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2406,5 +2406,15 @@ struct sched_affine_ctx { KABI_RESERVE(3) KABI_RESERVE(4) }; + +struct sched_migrate_node { + int src_cpu; + int dst_cpu; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; #endif #endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 818b1244a018..e2519a00aa6b 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -10,3 +10,5 @@ BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_stru BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p, + struct sched_migrate_node *migrate_node) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55b607564bd9..073c0cf35d3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9486,9 +9486,23 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; +#ifdef CONFIG_BPF_SCHED + struct sched_migrate_node migrate_node; + int ret; +#endif
lockdep_assert_rq_held(env->src_rq);
+#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + migrate_node.src_cpu = env->src_cpu; + migrate_node.dst_cpu = env->dst_cpu; + ret = bpf_sched_cfs_can_migrate_task(p, &migrate_node); + if (ret > 0) + return ret - 1; + } +#endif + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index fc51d6f0d447..6001bcf66d5e 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -444,6 +444,7 @@ class PrinterHelpers(Printer): 'struct cpumask_op_args', 'struct sched_migrate_ctx', 'struct sched_affine_ctx', + 'struct sched_migrate_node', ] known_types = { '...', @@ -496,6 +497,7 @@ class PrinterHelpers(Printer): 'struct cpumask_op_args', 'struct sched_migrate_ctx', 'struct sched_affine_ctx', + 'struct sched_migrate_node', } mapped_types = { 'u8': '__u8',
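A minimal BPF-side sketch of the new hook, not part of this patch; the section name is assumed to follow the existing sched hook convention. Per the kernel code above, a positive return value R makes can_migrate_task() return R - 1, so 1 denies the migration, 2 allows it, and 0 (or a negative value) falls through to the default checks.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_can_migrate_task")	/* section name assumed */
int BPF_PROG(skip_cross_llc, struct task_struct *p,
	     struct sched_migrate_node *node)
{
	/* Example policy: only pull tasks between CPUs sharing a cache. */
	if (!bpf_cpus_share_cache(node->src_cpu, node->dst_cpu))
		return 1;	/* can_migrate_task() returns 0: deny */

	return 0;		/* keep the default checks */
}

char _license[] SEC("license") = "GPL";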
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
'select_idle_mask' will be polluted in 'select_idle_sibling', so allocate dedicated per-CPU memory for ctx.select_idle_mask (a sketch of the adjusted exit-hook return convention follows the diff below).
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 kernel/sched/core.c | 7 +++++++
 kernel/sched/fair.c | 26 ++++++++++++++------------
 2 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 76fbb7657c8f..8a4478fc4123 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8232,6 +8232,9 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#ifdef CONFIG_BPF_SCHED +DECLARE_PER_CPU(cpumask_var_t, select_cpu_mask); +#endif
void __init sched_init(void) { @@ -8285,6 +8288,10 @@ void __init sched_init(void) cpumask_size(), GFP_KERNEL, cpu_to_node(i)); per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); +#ifdef CONFIG_BPF_SCHED + per_cpu(select_cpu_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); +#endif } #endif /* CONFIG_CPUMASK_OFFSTACK */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 073c0cf35d3a..20f971b7df19 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6578,6 +6578,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Working cpumask for: load_balance, load_balance_newidle. */ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +#ifdef CONFIG_BPF_SCHED +DEFINE_PER_CPU(cpumask_var_t, select_cpu_mask); +#endif
#ifdef CONFIG_NO_HZ_COMMON
@@ -7897,7 +7900,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); #ifdef CONFIG_BPF_SCHED struct sched_migrate_ctx ctx; - cpumask_t *cpus_prev = NULL; cpumask_t *cpus; int ret; #endif @@ -7912,8 +7914,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ lockdep_assert_held(&p->pi_lock);
-#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#ifdef CONFIG_TASK_PLACEMENT_BY_CPU_RANGE p->select_cpus = p->cpus_ptr; +#endif + +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY if (dynamic_affinity_used() || smart_grid_used()) set_task_select_cpus(p, &idlest_cpu, sd_flag); #endif @@ -7945,18 +7950,18 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f ctx.wake_flags = wake_flags; ctx.want_affine = want_affine; ctx.sd_flag = sd_flag; - ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask); + ctx.select_idle_mask = + this_cpu_cpumask_var_ptr(select_cpu_mask);
ret = bpf_sched_cfs_select_rq(&ctx); if (ret >= 0) { rcu_read_unlock(); return ret; } else if (ret != -1) { - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - if (cpumask_subset(cpus, p->cpus_ptr) && + cpus = this_cpu_cpumask_var_ptr(select_cpu_mask); + if (cpumask_subset(cpus, p->select_cpus) && !cpumask_empty(cpus)) { - cpus_prev = (void *)p->cpus_ptr; - p->cpus_ptr = cpus; + p->select_cpus = cpus; } } } @@ -8004,11 +8009,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (bpf_sched_enabled()) { ctx.new_cpu = new_cpu; ret = bpf_sched_cfs_select_rq_exit(&ctx); - if (ret >= 0) - new_cpu = ret; - - if (cpus_prev) - p->cpus_ptr = cpus_prev; + if (ret > 0 && ret <= nr_cpu_ids) + new_cpu = ret - 1; } #endif
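For reference, a do-nothing cfs_select_rq_exit program under the adjusted convention shown above; this is a sketch, not part of the patch, and the section name is assumed. Returning 0 keeps the kernel's choice, while returning cpu + 1 (for a valid cpu) overrides new_cpu.

SEC("sched/cfs_select_rq_exit")		/* section name assumed */
int BPF_PROG(exit_keep_choice, struct sched_migrate_ctx *ctx)
{
	/* ctx->new_cpu holds the kernel's pick; return 0 to accept it,
	 * or (cpu + 1) to substitute a different CPU.
	 */
	return 0;
}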
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Introduce a helper to perform a series of nodemask operations, such as 'nodes_empty', 'node_isset', etc. (a usage sketch follows the diff below).
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h          |  22 +++++++
 include/uapi/linux/bpf.h       |  29 +++++++++
 kernel/sched/bpf_sched.c       |  71 ++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |   2 +
 tools/include/uapi/linux/bpf.h |  29 +++++++++
 tools/lib/bpf/libbpf_sched.h   | 107 +++++++++++++++++++++++++++++++++
 6 files changed, 260 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 7ef4efd8cddb..3d5553f70401 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2378,6 +2378,28 @@ enum cpumask_op_type { CPUMASK_CPULIST_PARSE };
+enum nodemask_op_type { + NODEMASK_EMPTY, + NODEMASK_NODE_ISSET, + NODEMASK_NODES_CLEAR, + NODEMASK_NODE_SET, + NODEMASK_NODE_CLEAR, + NODEMASK_NODELIST_PARSE, + NODEMASK_TO_CPUMASK, + NODEMASK_NODES_ANDNOT, + NODEMASK_NODES_AND, + NODEMASK_NODES_OR, + NODEMASK_WEIGHT, + NODEMASK_ONLINE +}; + +struct nodemask_op_args { + enum nodemask_op_type op_type; + void *arg1; + void *arg2; + void *arg3; +}; + struct sched_migrate_ctx { struct task_struct *task; struct cpumask *select_idle_mask; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2b11202c3439..e608b32d4c5a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3874,6 +3874,34 @@ union bpf_attr { * check src_cpu whether share cache with dst_cpu. * Return * yes 1, no 0. + * + * int bpf_nodemask_op(struct nodemask_op_args *op, int len) + * Description + * A series of nodemask-related operations. Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. *op*->type is one of them + * + * **NODEMASK_EMPTY** + * nodes_empty(op->arg1) returned. + * **NODEMASK_NODE_ISSET** + * node_isset(op->arg1, op->arg2) returned + * **NODEMASK_NODES_CLEAR** + * 0 returned + * **NODEMASK_NODE_CLEAR** + * unset op->arg1 from op->arg2, 0 returned + * **NODEMASK_NODE_SET** + * set op->arg1 to op->arg2, 0 returned + * **NODEMASK_WEIGHT** + * nodes_weight(op->arg1) returned + * **NODEMASK_NODELIST_PARSE** + * str *op->arg1* to nodemask_t *op->arg2*, + * 0 on success, or a negative error in case of failure. + * **NODEMASK_TO_CPUMASK** + * nodemask_t *arg1* to cpumask_t *op->arg2*, 0 returned. + * **NODEMASK_ONLINE** + * set online nodes to nodemask_t *op->arg1*, 0 returned. + * Return + * View above. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4046,6 +4074,7 @@ union bpf_attr { FN(sched_entity_to_tg), \ FN(cpumask_op), \ FN(cpus_share_cache), \ + FN(nodemask_op), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 220ba83fc5f4..a1e72533d5b6 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -260,6 +260,75 @@ static const struct bpf_func_proto bpf_cpumask_op_proto = { .arg2_type = ARG_CONST_SIZE, };
+BPF_CALL_2(bpf_nodemask_op, struct nodemask_op_args *, op, int, len) +{ + struct cpumask *cpumask; + nodemask_t mask; + int nid; + + if (len != sizeof(*op) || !op->arg1) + return -EINVAL; + + switch (op->op_type) { + case NODEMASK_EMPTY: + mask = *(nodemask_t *)op->arg1; + return nodes_empty(mask); + case NODEMASK_NODE_ISSET: + mask = *(nodemask_t *)op->arg2; + return node_isset(*(int *)op->arg1, mask); + case NODEMASK_NODES_CLEAR: + __nodes_clear((nodemask_t *)op->arg1, MAX_NUMNODES); + break; + case NODEMASK_NODE_CLEAR: + __node_clear(*(int *)op->arg1, (nodemask_t *)op->arg2); + break; + case NODEMASK_NODE_SET: + __node_set(*(int *)op->arg1, (nodemask_t *)op->arg2); + break; + case NODEMASK_NODES_AND: + __nodes_and((nodemask_t *)op->arg1, (nodemask_t *)op->arg2, + (nodemask_t *)op->arg3, MAX_NUMNODES); + break; + case NODEMASK_NODES_ANDNOT: + __nodes_andnot((nodemask_t *)op->arg1, (nodemask_t *)op->arg2, + (nodemask_t *)op->arg3, MAX_NUMNODES); + break; + case NODEMASK_NODES_OR: + __nodes_or((nodemask_t *)op->arg1, (nodemask_t *)op->arg2, + (nodemask_t *)op->arg3, MAX_NUMNODES); + break; + case NODEMASK_WEIGHT: + mask = *(nodemask_t *)op->arg1; + return nodes_weight(mask); + case NODEMASK_NODELIST_PARSE: + return __nodelist_parse((const char *)op->arg1, + (nodemask_t *)op->arg2, MAX_NUMNODES); + case NODEMASK_TO_CPUMASK: + mask = *(nodemask_t *)op->arg1; + cpumask = (struct cpumask *)op->arg2; + cpumask_clear(cpumask); + for_each_node_mask(nid, mask) { + cpumask_or(cpumask, cpumask, cpumask_of_node(nid)); + } + break; + case NODEMASK_ONLINE: + *(nodemask_t *)op->arg1 = node_online_map; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_nodemask_op_proto = { + .func = bpf_nodemask_op, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, +}; + BPF_CALL_2(bpf_cpus_share_cache, int, src_cpu, int, dst_cpu) { if ((unsigned int)src_cpu >= nr_cpu_ids || @@ -299,6 +368,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_cpumask_op_proto; case BPF_FUNC_cpus_share_cache: return &bpf_cpus_share_cache_proto; + case BPF_FUNC_nodemask_op: + return &bpf_nodemask_op_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6001bcf66d5e..929a3031b4d6 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -445,6 +445,7 @@ class PrinterHelpers(Printer): 'struct sched_migrate_ctx', 'struct sched_affine_ctx', 'struct sched_migrate_node', + 'struct nodemask_op_args', ] known_types = { '...', @@ -498,6 +499,7 @@ class PrinterHelpers(Printer): 'struct sched_migrate_ctx', 'struct sched_affine_ctx', 'struct sched_migrate_node', + 'struct nodemask_op_args', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e241f8d4becd..809723603ba8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3874,6 +3874,34 @@ union bpf_attr { * check src_cpu whether share cache with dst_cpu. * Return * true yes, false no. + * + * int bpf_nodemask_op(struct nodemask_op_args *op, int len) + * Description + * A series of nodemask-related operations. Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. *op*->type is one of them + * + * **NODEMASK_EMPTY** + * nodes_empty(op->arg1) returned. 
+ * **NODEMASK_NODE_ISSET** + * node_isset(op->arg1, op->arg2) returned + * **NODEMASK_NODES_CLEAR** + * 0 returned + * **NODEMASK_NODE_CLEAR** + * unset op->arg1 from op->arg2, 0 returned + * **NODEMASK_NODE_SET** + * set op->arg1 to op->arg2, 0 returned + * **NODEMASK_WEIGHT** + * nodes_weight(op->arg1) returned + * **NODEMASK_NODELIST_PARSE** + * str *op->arg1* to nodemask_t *op->arg2*, + * 0 on success, or a negative error in case of failure. + * **NODEMASK_TO_CPUMASK** + * nodemask_t *arg1* to cpumask_t *op->arg2*, 0 returned. + * **NODEMASK_ONLINE** + * set online nodes to nodemask_t *op->arg1*, 0 returned. + * Return + * View above. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4046,6 +4074,7 @@ union bpf_attr { FN(sched_entity_to_tg), \ FN(cpumask_op), \ FN(cpus_share_cache), \ + FN(nodemask_op), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h index 04b43c145fcd..7fa5f03e6ba4 100644 --- a/tools/lib/bpf/libbpf_sched.h +++ b/tools/lib/bpf/libbpf_sched.h @@ -16,6 +16,7 @@ #define __LIBBPF_LIBSCHED_H
#include <linux/bpf_topology.h> +#include <linux/numa.h> #include <linux/version.h> #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> @@ -78,6 +79,112 @@ struct { __uint(max_entries, 1); } map_cpumask_info SEC(".maps");
+static __always_inline void +libbpf_nodes_and(nodemask_t *dst, nodemask_t *src1, nodemask_t *src2) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODES_AND; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline void +libbpf_nodes_andnot(nodemask_t *dst, nodemask_t *src1, nodemask_t *src2) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODES_ANDNOT; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline void +libbpf_nodes_or(nodemask_t *dst, nodemask_t *src1, nodemask_t *src2) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODES_OR; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline void libbpf_node_set(int nid, + nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODE_SET; + op.arg1 = &nid; + op.arg2 = nodes; + op.arg3 = INVALID_PTR; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline void libbpf_node_clear(int nid, + nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODE_CLEAR; + op.arg1 = &nid; + op.arg2 = nodes; + op.arg3 = INVALID_PTR; + bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_node_isset(int nid, + nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_NODE_ISSET; + op.arg1 = &nid; + op.arg2 = nodes; + op.arg3 = INVALID_PTR; + return bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_nodemask_empty(nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_EMPTY; + op.arg1 = nodes; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + return bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_nodemask_to_cpumask(nodemask_t *nodes, + struct cpumask *cpus) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_TO_CPUMASK; + op.arg1 = nodes; + op.arg2 = cpus; + op.arg3 = INVALID_PTR; + return bpf_nodemask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_nodes_online(nodemask_t *nodes) +{ + struct nodemask_op_args op = {0}; + + op.op_type = NODEMASK_ONLINE; + op.arg1 = nodes; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + return bpf_nodemask_op(&op, sizeof(op)); +} + static __always_inline long libbpf_cpumask_copy(struct cpumask *dst, struct cpumask *src) {
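A hedged usage sketch of the new nodemask wrappers in tools/lib/bpf/libbpf_sched.h, combined with the cfs_select_rq convention from the earlier patches (a CPU id >= 0 picks that CPU, -1 keeps the default path, any other negative value asks the kernel to use the mask written into ctx->select_idle_mask). The section name and the "prefer node 0" policy are assumptions for illustration only.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "libbpf_sched.h"

SEC("sched/cfs_select_rq")		/* section name assumed */
int BPF_PROG(place_on_node0, struct sched_migrate_ctx *ctx)
{
	nodemask_t online = {}, target = {};
	int nid = 0;			/* illustrative target node */

	libbpf_nodes_online(&online);
	if (!libbpf_node_isset(nid, &online))
		return -1;		/* node offline: default path */

	/* Hand the kernel a cpumask limited to the target node. */
	libbpf_node_set(nid, &target);
	libbpf_nodemask_to_cpumask(&target, ctx->select_idle_mask);
	return -2;			/* use ctx->select_idle_mask */
}

char _license[] SEC("license") = "GPL";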
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
There may be relationships between threads, such as network communication and memory sharing.
Threads that have such relationships generally perform better when they are scheduled onto the same SMT core, cluster, or NUMA node, because they share certain resources.
This patch adds a mechanism to the scheduler to identify and maintain these affinity relationships. The memory and network parts are implemented here; other relationship types can be added in the future (a hedged usage sketch follows the diff below).
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 fs/exec.c                          |   2 +
 include/linux/sched.h              |   5 +
 include/linux/sched/relationship.h | 137 +++++++++++
 init/Kconfig                       |  10 +
 init/init_task.c                   |   3 +
 kernel/fork.c                      |  13 +
 kernel/sched/Makefile              |   1 +
 kernel/sched/debug.c               |   2 +
 kernel/sched/fair.c                |  80 +++++-
 kernel/sched/relationship.c        | 379 +++++++++++++++++++++++++++++
 10 files changed, 631 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/relationship.h
 create mode 100644 kernel/sched/relationship.c
diff --git a/fs/exec.c b/fs/exec.c index 981b3ac90c44..792d62632e92 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -38,6 +38,7 @@ #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> +#include <linux/sched/relationship.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> @@ -1822,6 +1823,7 @@ static int bprm_execve(struct linux_binprm *bprm, rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); + task_relationship_free(current, true); return retval;
out: diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d5553f70401..af43d8d55e1b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -26,6 +26,7 @@ #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> +#include <linux/sched/relationship.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/mm_types_task.h> @@ -1468,7 +1469,11 @@ struct task_struct { #else KABI_RESERVE(13) #endif +#if defined(CONFIG_SCHED_TASK_RELATIONSHIP) && !defined(__GENKSYMS__) + KABI_USE(14, struct task_relationship *rship) +#else KABI_RESERVE(14) +#endif KABI_RESERVE(15) KABI_RESERVE(16) KABI_AUX_PTR(task_struct) diff --git a/include/linux/sched/relationship.h b/include/linux/sched/relationship.h new file mode 100644 index 000000000000..df3f3f7814cd --- /dev/null +++ b/include/linux/sched/relationship.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_RELATIONSHIP_H +#define _LINUX_SCHED_RELATIONSHIP_H + +#include <linux/nodemask.h> +#include <linux/jump_label.h> +#include <linux/refcount.h> + +#define FAULT_NODES_MAX 4 + +struct task_struct; +struct rq; + +#ifdef CONFIG_SCHED_DEBUG +struct seq_file; +#endif + +struct fault_array_info { + int nid; + unsigned long val; +}; + +struct relationship_hdr { + refcount_t refcount; + spinlock_t lock; + int nr_tasks; + int gid; + nodemask_t preferred_nid; +}; + +enum net_req_type { + NET_RS_TYPE_INVALID = 0, + NET_RS_TYPE_LOCAL, + NET_RS_TYPE_RX, + NET_RS_TYPE_TX, + NET_RS_TYPE_MAX +}; + +struct net_relationship_req { + enum net_req_type net_rship_type; + pid_t rx_pid; + pid_t tx_pid; + int nic_nid; + int rx_dev_idx; + int rx_dev_queue_idx; + u64 rx_dev_netns_cookie; + unsigned long rxtx_bytes; + + /* reserved */ + unsigned long rxtx_cnt; +}; + +struct net_relationship_callback { + struct callback_head twork; + atomic_t active; + pid_t src_pid; + struct net_relationship_req req; +}; + +struct net_group { + struct rcu_head rcu; + struct relationship_hdr hdr; + unsigned long rxtx_bytes; + + /* reserved */ + unsigned long rxtx_cnt; +}; + +struct numa_fault_ext { + struct fault_array_info faults_ordered[FAULT_NODES_MAX]; +}; + +struct task_relationship { + /* network relationship */ + struct net_group __rcu *net_group; + spinlock_t net_lock; + int nic_nid; + int rx_dev_idx; + int rx_dev_queue_idx; + unsigned long rx_dev_netns_cookie; + unsigned long rxtx_remote_bytes; + unsigned long rxtx_remote_update_next; + unsigned long rxtx_remote_buffer; + unsigned long rxtx_bytes; + unsigned long rxtx_buffer; + unsigned long rxtx_update_next; + struct net_relationship_callback cb; + + /* extras numa fault data */ + struct numa_fault_ext faults; +}; + +extern void task_relationship_enable(void); +extern void task_relationship_disable(void); + +#ifdef CONFIG_SCHED_DEBUG +extern void sched_show_relationship(struct task_struct *p, struct seq_file *m); +#endif + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +extern int sched_relationship_fork(struct task_struct *p); +extern void sched_relationship_free(struct task_struct *p); +void task_relationship_free(struct task_struct *tsk, bool reset); +extern bool task_relationship_supported(struct task_struct *tsk); +extern int sched_net_relationship_submit(struct net_relationship_req *req); +extern void numa_faults_update_and_sort(int nid, int new, + struct fault_array_info *stats); + +DECLARE_STATIC_KEY_FALSE(__relationship_switch); +static inline bool task_relationship_used(void) +{ + return 
static_branch_unlikely(&__relationship_switch); +} +#else +static inline bool task_relationship_used(void) +{ + return false; +} + +static inline int sched_relationship_fork(struct task_struct *p) +{ + return 0; +} + +static inline void sched_relationship_free(struct task_struct *p) {} + +static inline void +task_relationship_free(struct task_struct *tsk, bool reset) {} + +static inline int +sched_net_relationship_submit(struct net_relationship_req *req) +{ + return 0; +} +#endif + +#endif diff --git a/init/Kconfig b/init/Kconfig index 325c02d4a4df..ea9a6e93155b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1081,6 +1081,16 @@ config QOS_SCHED_DYNAMIC_AFFINITY of taskgroup is below threshold setted, otherwise make taskgroup to use cpus allowed.
+config SCHED_TASK_RELATIONSHIP + bool "task relationship" + depends on NUMA_BALANCING + default n + help + This feature enables the scheduler to identify tasks relationship by + page fault, SPE, socket and other IPC method. + + If in doubt, say N. + config UCLAMP_TASK_GROUP bool "Utilization clamping per group of tasks" depends on CGROUP_SCHED diff --git a/init/init_task.c b/init/init_task.c index fa8838c2c203..3b846f8223d9 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -217,6 +217,9 @@ struct task_struct init_task #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY .prefer_cpus = NULL, #endif +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + .rship = NULL, +#endif #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif diff --git a/kernel/fork.c b/kernel/fork.c index 079b718131b0..12db99751381 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -476,6 +476,8 @@ void free_task(struct task_struct *tsk) #ifdef CONFIG_QOS_SCHED_SMART_GRID sched_grid_qos_free(tsk); #endif + if (task_relationship_used()) + sched_relationship_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -748,6 +750,7 @@ void __put_task_struct(struct task_struct *tsk) io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); + task_relationship_free(tsk, false); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); @@ -949,6 +952,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->prefer_cpus = NULL; #endif
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + tsk->rship = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -2102,6 +2109,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; #endif
+ if (task_relationship_used()) { + retval = sched_relationship_fork(p); + if (retval) + goto bad_fork_cleanup_count; + } + /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a6fe0ee09917..114dc36320c6 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_SCHED_CORE) += core_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_topology.o obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ +obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 00f01518bbdd..5233ba9fdc69 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1040,6 +1040,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, }
sched_show_numa(p, m); + + sched_show_relationship(p, m); }
void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20f971b7df19..f9aa00ec559e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1086,6 +1086,11 @@ struct numa_group { struct rcu_head rcu; unsigned long total_faults; unsigned long max_faults_cpu; +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct fault_array_info score_ordered[FAULT_NODES_MAX]; + struct fault_array_info faults_ordered[FAULT_NODES_MAX]; + nodemask_t preferred_nid; +#endif /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -2279,6 +2284,9 @@ static int preferred_group_nid(struct task_struct *p, int nid) { nodemask_t nodes; int dist; +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct numa_group *ng; +#endif
/* Direct connections between all NUMA nodes. */ if (sched_numa_topology_type == NUMA_DIRECT) @@ -2301,7 +2309,19 @@ static int preferred_group_nid(struct task_struct *p, int nid) max_score = score; max_node = node; } +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + ng = deref_curr_numa_group(p); + if (ng) { + spin_lock_irq(&ng->lock); + numa_faults_update_and_sort(node, score, + ng->score_ordered); + spin_unlock_irq(&ng->lock); + } + } +#endif } + return max_node; }
@@ -2451,6 +2471,17 @@ static void task_numa_placement(struct task_struct *p) max_faults = group_faults; max_nid = nid; } + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + numa_faults_update_and_sort(nid, faults, + p->rship->faults.faults_ordered); + + if (ng) + numa_faults_update_and_sort(nid, group_faults, + ng->faults_ordered); + } +#endif }
if (ng) { @@ -2512,6 +2543,16 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + grp->preferred_nid = NODE_MASK_NONE; + for (i = 0; i < FAULT_NODES_MAX; i++) { + grp->faults_ordered[i].nid = -1; + grp->score_ordered[i].nid = -1; + } + } +#endif }
rcu_read_lock(); @@ -2623,6 +2664,15 @@ void task_numa_free(struct task_struct *p, bool final) p->total_numa_faults = 0; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) numa_faults[i] = 0; + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + for (i = 0; i < FAULT_NODES_MAX; i++) { + p->rship->faults.faults_ordered[i].nid = -1; + p->rship->faults.faults_ordered[i].val = 0; + } + } +#endif } }
@@ -13707,7 +13757,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) struct numa_group *ng;
rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; @@ -13722,6 +13772,34 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ + +void sched_show_relationship(struct task_struct *p, struct seq_file *m) +{ +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct net_group *net_grp; + struct numa_group *ng; + + if (!task_relationship_used()) + return; + + rcu_read_lock(); + + ng = rcu_dereference(p->numa_group); + if (ng) { + seq_printf(m, "numa group preferred nid %*pbl\n", + nodemask_pr_args(&ng->preferred_nid)); + } + + net_grp = rcu_dereference(p->rship->net_group); + if (net_grp) { + seq_printf(m, "net group gid %d preferred nid %*pbl\n", + net_grp->hdr.gid, + nodemask_pr_args(&net_grp->hdr.preferred_nid)); + } + + rcu_read_unlock(); +#endif +} #endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void) diff --git a/kernel/sched/relationship.c b/kernel/sched/relationship.c new file mode 100644 index 000000000000..01879e3272de --- /dev/null +++ b/kernel/sched/relationship.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for task relationship aware + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Hui Tang tanghui20@huawei.com + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include <linux/bpf_sched.h> +#include <linux/sort.h> + +#include "sched.h" + +#define RXTX_BYTES_PERIOD_MS (1000) +#define RXTX_BYTES_DECAY_RATIO (2) + +DEFINE_STATIC_KEY_FALSE(__relationship_switch); + +void task_relationship_enable(void) +{ + static_branch_enable(&__relationship_switch); +} + +void task_relationship_disable(void) +{ + static_branch_disable(&__relationship_switch); +} + +bool task_relationship_supported(struct task_struct *tsk) +{ + if (!task_relationship_used()) + return false; + + if (!tsk->rship || !tsk->mm || + !cpumask_subset(cpu_online_mask, tsk->cpus_ptr) || + !nodes_subset(node_online_map, tsk->mems_allowed) || + get_task_policy(tsk)->mode == MPOL_BIND || + get_task_policy(tsk)->mode == MPOL_INTERLEAVE) + return false; + + return true; +} + +static inline int get_net_group(struct net_group *grp) +{ + return refcount_inc_not_zero(&grp->hdr.refcount); +} + +static inline void put_net_group(struct net_group *grp) +{ + if (refcount_dec_and_test(&grp->hdr.refcount)) + kfree_rcu(grp, rcu); +} + +static inline void put_task_net_group(struct task_struct *tsk, bool reset) +{ + struct net_group *grp; + unsigned long flags; + + spin_lock_irqsave(&tsk->rship->net_lock, flags); + + grp = rcu_dereference_protected(tsk->rship->net_group, + lockdep_is_held(&tsk->rship->net_lock)); + if (grp) { + spin_lock(&grp->hdr.lock); + grp->rxtx_bytes -= tsk->rship->rxtx_bytes; + grp->hdr.nr_tasks--; + spin_unlock(&grp->hdr.lock); + put_net_group(grp); + RCU_INIT_POINTER(tsk->rship->net_group, NULL); + } + + if (reset) { + tsk->rship->rxtx_bytes = 0; + tsk->rship->rxtx_remote_bytes = 0; + tsk->rship->rx_dev_idx = -1; + tsk->rship->rx_dev_queue_idx = -1; + tsk->rship->nic_nid = -1; + tsk->rship->rx_dev_netns_cookie = 0; + } + + spin_unlock_irqrestore(&tsk->rship->net_lock, flags); +} + +static inline int remote_rxtx_process(struct net_relationship_req *req) +{ + struct task_relationship *rship; + struct task_struct *tsk; + unsigned long flags; + pid_t pid; + long diff; + + rcu_read_lock(); + + pid = req->net_rship_type == NET_RS_TYPE_RX ? 
req->rx_pid : req->tx_pid; + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (!tsk || !task_relationship_supported(tsk)) + goto out_unlock; + + rship = tsk->rship; + if (time_after(jiffies, rship->rxtx_remote_update_next)) { + diff = rship->rxtx_remote_buffer - rship->rxtx_remote_bytes / 2; + + spin_lock_irqsave(&rship->net_lock, flags); + rship->nic_nid = req->nic_nid; + if (req->net_rship_type == NET_RS_TYPE_RX) { + rship->rx_dev_idx = req->rx_dev_idx; + rship->rx_dev_queue_idx = req->rx_dev_queue_idx; + rship->rx_dev_netns_cookie = req->rx_dev_netns_cookie; + } + rship->rxtx_remote_bytes += diff; + rship->rxtx_remote_buffer = 0; + spin_unlock_irqrestore(&rship->net_lock, flags); + } + + rship->rxtx_remote_buffer += req->rxtx_bytes; + +out_unlock: + rcu_read_unlock(); + + return 0; +} + +int sched_net_relationship_submit(struct net_relationship_req *req) +{ + struct task_struct *rx_tsk, *tx_tsk, *dst_tsk; + struct net_group *rx_grp, *tx_grp; + int ret; + + if (req->net_rship_type == NET_RS_TYPE_RX || + req->net_rship_type == NET_RS_TYPE_TX) + return remote_rxtx_process(req); + + rcu_read_lock(); + + rx_tsk = find_task_by_pid_ns(req->rx_pid, &init_pid_ns); + tx_tsk = find_task_by_pid_ns(req->tx_pid, &init_pid_ns); + if (!rx_tsk || !tx_tsk) { + ret = -ESRCH; + goto out_unlock; + } + + if (!task_relationship_supported(rx_tsk) || + !task_relationship_supported(tx_tsk)) { + ret = -EPERM; + goto out_unlock; + } + + if (atomic_read(&rx_tsk->rship->cb.active) && + atomic_read(&tx_tsk->rship->cb.active)) { + ret = -EBUSY; + goto out_unlock; + } + + rx_grp = rcu_dereference(rx_tsk->rship->net_group); + tx_grp = rcu_dereference(tx_tsk->rship->net_group); + if (rx_grp && tx_grp) { + dst_tsk = rx_grp->hdr.nr_tasks >= tx_grp->hdr.nr_tasks ? + rx_tsk : tx_tsk; + } else if (rx_grp) { + dst_tsk = rx_tsk; + } else if (tx_grp) { + dst_tsk = tx_tsk; + } else { + dst_tsk = !atomic_read(&rx_tsk->rship->cb.active) ? + rx_tsk : tx_tsk; + } + + if (atomic_cmpxchg(&dst_tsk->rship->cb.active, 0, 1)) { + ret = -EBUSY; + goto out_unlock; + } + + memcpy(&dst_tsk->rship->cb.req, req, sizeof(*req)); + dst_tsk->rship->cb.src_pid = dst_tsk == rx_tsk ? 
+ req->tx_pid : req->rx_pid; + task_work_add(dst_tsk, &dst_tsk->rship->cb.twork, TWA_RESUME); + ret = 0; + +out_unlock: + rcu_read_unlock(); + return ret; +} + +static void task_net_group(struct task_struct *curr, struct task_struct *src) +{ + struct net_group *src_grp, *curr_grp, *grp; + + double_lock_irq(&src->rship->net_lock, &curr->rship->net_lock); + curr_grp = rcu_dereference_protected(curr->rship->net_group, + lockdep_is_held(&curr->rship->net_lock)); + src_grp = rcu_dereference_protected(src->rship->net_group, + lockdep_is_held(&src->rship->net_lock)); + + if (!curr_grp) { + grp = kzalloc(sizeof(*grp), GFP_ATOMIC | __GFP_NOWARN); + if (!grp) + goto out_unlock; + + refcount_set(&grp->hdr.refcount, 1); + spin_lock_init(&grp->hdr.lock); + grp->hdr.gid = curr->pid; + grp->hdr.preferred_nid = NODE_MASK_NONE; + node_set(task_node(curr), grp->hdr.preferred_nid); + grp->hdr.nr_tasks = 1; + rcu_assign_pointer(curr->rship->net_group, grp); + curr_grp = rcu_dereference_protected(curr->rship->net_group, + lockdep_is_held(&curr->rship->net_lock)); + } + + if (curr_grp == src_grp) + goto out_unlock; + + if (!get_net_group(curr_grp)) + goto out_unlock; + + spin_lock(&curr_grp->hdr.lock); + curr_grp->hdr.nr_tasks++; + curr_grp->rxtx_bytes += src->rship->rxtx_bytes; + spin_unlock(&curr_grp->hdr.lock); + + if (src_grp) { + spin_lock(&src_grp->hdr.lock); + src_grp->hdr.nr_tasks--; + src_grp->rxtx_bytes -= src->rship->rxtx_bytes; + spin_unlock(&src_grp->hdr.lock); + put_net_group(src_grp); + } + + rcu_assign_pointer(src->rship->net_group, curr_grp); +out_unlock: + spin_unlock(&src->rship->net_lock); + spin_unlock_irq(&curr->rship->net_lock); +} + +static void task_rxtx_data_update(struct task_struct *tsk) +{ + struct net_group *grp; + long bytes_diff; + + spin_lock_irq(&tsk->rship->net_lock); + bytes_diff = tsk->rship->rxtx_buffer - + tsk->rship->rxtx_bytes / RXTX_BYTES_DECAY_RATIO; + tsk->rship->rxtx_bytes += bytes_diff; + tsk->rship->rxtx_buffer = 0; + tsk->rship->rxtx_update_next = jiffies + + msecs_to_jiffies(RXTX_BYTES_PERIOD_MS); + + grp = rcu_dereference_protected(tsk->rship->net_group, + lockdep_is_held(&tsk->rship->net_lock)); + if (grp) { + spin_lock(&grp->hdr.lock); + grp->rxtx_bytes += bytes_diff; + spin_unlock(&grp->hdr.lock); + } + + spin_unlock_irq(&tsk->rship->net_lock); +} + +static void task_net_relationship_work(struct callback_head *work) +{ + struct net_relationship_callback *ncb; + struct task_struct *curr = current; + struct net_relationship_req req; + struct task_struct *src; + + ncb = container_of(work, struct net_relationship_callback, twork); + req = ncb->req; + atomic_set(&ncb->active, 0); + + rcu_read_lock(); + src = find_task_by_pid_ns(ncb->src_pid, &init_pid_ns); + if (!src) { + rcu_read_unlock(); + return; + } + + if (!task_relationship_supported(src) || + !task_relationship_supported(curr)) { + rcu_read_unlock(); + return; + } + + /* prevent src going away */ + get_task_struct(src); + + rcu_read_unlock(); + + /* build net relationship */ + task_net_group(src, curr); + + if (time_after(jiffies, curr->rship->rxtx_update_next)) + task_rxtx_data_update(curr); + + if (time_after(jiffies, src->rship->rxtx_update_next)) + task_rxtx_data_update(src); + + double_lock_irq(&src->rship->net_lock, &curr->rship->net_lock); + curr->rship->rxtx_buffer += req.rxtx_bytes; + src->rship->rxtx_buffer += req.rxtx_bytes; + spin_unlock(&src->rship->net_lock); + spin_unlock_irq(&curr->rship->net_lock); + + put_task_struct(src); +} + +static int cmp_fault_stats(const void *a, const void *b) +{ 
+ return ((struct fault_array_info *)b)->val - + ((struct fault_array_info *)a)->val; +} + +void numa_faults_update_and_sort(int nid, int new, + struct fault_array_info *stats) +{ + int nodes, i; + + if (!task_relationship_used()) + return; + + if (nid == first_online_node) { + for (i = 0; i < FAULT_NODES_MAX; i++) { + stats[i].nid = -1; + stats[i].val = 0; + } + } + + nodes = min(FAULT_NODES_MAX, num_online_nodes()); + if (new <= stats[nodes - 1].val) + return; + + stats[nodes - 1].nid = nid; + stats[nodes - 1].val = new; + sort(stats, nodes, sizeof(stats[0]), cmp_fault_stats, NULL); +} + +void task_relationship_free(struct task_struct *tsk, bool reset) +{ + if (!task_relationship_used()) + return; + + put_task_net_group(tsk, reset); +} + +int sched_relationship_fork(struct task_struct *p) +{ + int i; + + p->rship = kzalloc(sizeof(struct task_relationship), GFP_KERNEL); + if (!p->rship) + return -ENOMEM; + + for (i = 0; i < FAULT_NODES_MAX; i++) + p->rship->faults.faults_ordered[i].nid = -1; + + p->rship->nic_nid = -1; + p->rship->rx_dev_idx = -1; + p->rship->rx_dev_queue_idx = -1; + + spin_lock_init(&p->rship->net_lock); + init_task_work(&p->rship->cb.twork, task_net_relationship_work); + return 0; +} + +void sched_relationship_free(struct task_struct *p) +{ + kfree(p->rship); + p->rship = NULL; +}
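Illustrative only: how a networking-side caller might report a local sender/receiver pairing to the scheduler. Only struct net_relationship_req and sched_net_relationship_submit() come from this patch; the reporting function and its call site are hypothetical.

#include <linux/sched/relationship.h>

/* Hypothetical call site, e.g. somewhere in a local socket RX path. */
static void report_local_rxtx(pid_t rx_pid, pid_t tx_pid, unsigned long bytes)
{
	struct net_relationship_req req = {
		.net_rship_type	= NET_RS_TYPE_LOCAL,
		.rx_pid		= rx_pid,
		.tx_pid		= tx_pid,
		.rxtx_bytes	= bytes,
	};

	/* Queues task_work on one of the two tasks; that task then
	 * joins (or creates) a net_group shared with its peer.
	 */
	sched_net_relationship_submit(&req);
}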
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Introduce a bpf helper to get task relationship data. For now it reports memory-sharing and network-communication statistics (a usage sketch follows the diff below).
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched/relationship.h | 28 ++++++++++++++++++++++++++++
 include/uapi/linux/bpf.h           |  7 +++++++
 kernel/sched/bpf_sched.c           | 29 +++++++++++++++++++++++++++++
 kernel/sched/fair.c                | 26 ++++++++++++++++++++++++++
 kernel/sched/relationship.c        | 22 ++++++++++++++++++++++
 scripts/bpf_helpers_doc.py         |  2 ++
 tools/include/uapi/linux/bpf.h     |  7 +++++++
 7 files changed, 121 insertions(+)
diff --git a/include/linux/sched/relationship.h b/include/linux/sched/relationship.h index df3f3f7814cd..f603ed71e3e4 100644 --- a/include/linux/sched/relationship.h +++ b/include/linux/sched/relationship.h @@ -20,6 +20,30 @@ struct fault_array_info { unsigned long val; };
+struct relationship_comm { + int nr_tasks; + int gid; + nodemask_t preferred_node; +}; + +struct bpf_net_relationship { + struct relationship_comm comm; + unsigned long grp_rxtx_bytes; + unsigned long grp_remote_rxtx_bytes; +}; + +struct bpf_mm_relationship { + struct relationship_comm comm; + unsigned long grp_total_faults; + struct fault_array_info grp_faults_ordered[FAULT_NODES_MAX]; + struct fault_array_info grp_score_ordered[FAULT_NODES_MAX]; +}; + +struct bpf_relationship_get_args { + struct bpf_mm_relationship mm; + struct bpf_net_relationship net; +}; + struct relationship_hdr { refcount_t refcount; spinlock_t lock; @@ -103,6 +127,10 @@ extern void sched_relationship_free(struct task_struct *p); void task_relationship_free(struct task_struct *tsk, bool reset); extern bool task_relationship_supported(struct task_struct *tsk); extern int sched_net_relationship_submit(struct net_relationship_req *req); +extern void sched_get_mm_relationship(struct task_struct *tsk, + struct bpf_relationship_get_args *args); +extern void sched_get_relationship(struct task_struct *tsk, + struct bpf_relationship_get_args *args); extern void numa_faults_update_and_sort(int nid, int new, struct fault_array_info *stats);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e608b32d4c5a..1274fe6d3ab8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3902,6 +3902,12 @@ union bpf_attr { * set online nodes to nodemask_t *op->arg1*, 0 returned. * Return * View above. + * + * int bpf_get_task_relationship_stats(struct task_struct *tsk, struct bpf_map *map, struct bpf_relationship_get_args *stats) + * Description + * get relationship statistics of *tsk* and store in *stats*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4081,7 @@ union bpf_attr { FN(cpumask_op), \ FN(cpus_share_cache), \ FN(nodemask_op), \ + FN(get_task_relationship_stats),\ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index a1e72533d5b6..3cff265526b2 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -346,6 +346,31 @@ static const struct bpf_func_proto bpf_cpus_share_cache_proto = { .arg2_type = ARG_ANYTHING, };
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +BPF_CALL_3(bpf_get_task_relationship_stats, struct task_struct *, tsk, + struct bpf_map *, map, struct bpf_relationship_get_args *, args) +{ + if (!task_relationship_supported(tsk)) + return -EPERM; + + if (!args) + return -EINVAL; + + sched_get_relationship(tsk, args); + return 0; +} + +const struct bpf_func_proto bpf_get_task_relationship_stats_proto = { + .func = bpf_get_task_relationship_stats, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, +}; +#endif + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -370,6 +395,10 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_cpus_share_cache_proto; case BPF_FUNC_nodemask_op: return &bpf_nodemask_op_proto; +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + case BPF_FUNC_get_task_relationship_stats: + return &bpf_get_task_relationship_stats_proto; +#endif default: return bpf_base_func_proto(func_id); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f9aa00ec559e..b1660079992b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3042,6 +3042,32 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + +#ifdef CONFIG_BPF_SCHED +void sched_get_mm_relationship(struct task_struct *tsk, + struct bpf_relationship_get_args *args) +{ +#ifdef CONFIG_NUMA_BALANCING + struct numa_group *grp; + + grp = rcu_dereference(tsk->numa_group); + if (grp) { + args->mm.comm.gid = grp->gid; + args->mm.comm.nr_tasks = grp->nr_tasks; + args->mm.grp_total_faults = grp->total_faults; + args->mm.comm.preferred_node = grp->preferred_nid; + memcpy(args->mm.grp_faults_ordered, grp->faults_ordered, + sizeof(args->mm.grp_faults_ordered)); + memcpy(args->mm.grp_score_ordered, grp->score_ordered, + sizeof(args->mm.grp_score_ordered)); + } +#endif +} +#endif + +#endif + #ifdef CONFIG_QOS_SCHED_PRIO_LB static __always_inline void adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *), diff --git a/kernel/sched/relationship.c b/kernel/sched/relationship.c index 01879e3272de..89feb4b218d4 100644 --- a/kernel/sched/relationship.c +++ b/kernel/sched/relationship.c @@ -344,6 +344,28 @@ void numa_faults_update_and_sort(int nid, int new, sort(stats, nodes, sizeof(stats[0]), cmp_fault_stats, NULL); }
+void sched_get_relationship(struct task_struct *tsk, + struct bpf_relationship_get_args *args) +{ + struct net_group *ngrp; + + rcu_read_lock(); + + /* memory relationship */ + sched_get_mm_relationship(tsk, args); + + /* net relationship */ + ngrp = rcu_dereference(tsk->rship->net_group); + if (ngrp) { + args->net.comm.gid = ngrp->hdr.gid; + args->net.comm.nr_tasks = ngrp->hdr.nr_tasks; + args->net.comm.preferred_node = ngrp->hdr.preferred_nid; + args->net.grp_rxtx_bytes = ngrp->rxtx_bytes; + } + + rcu_read_unlock(); +} + void task_relationship_free(struct task_struct *tsk, bool reset) { if (!task_relationship_used()) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 929a3031b4d6..db43107ba6f0 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -446,6 +446,7 @@ class PrinterHelpers(Printer): 'struct sched_affine_ctx', 'struct sched_migrate_node', 'struct nodemask_op_args', + 'struct bpf_relationship_get_args', ] known_types = { '...', @@ -500,6 +501,7 @@ class PrinterHelpers(Printer): 'struct sched_affine_ctx', 'struct sched_migrate_node', 'struct nodemask_op_args', + 'struct bpf_relationship_get_args', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 809723603ba8..ac08b57826a2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3902,6 +3902,12 @@ union bpf_attr { * set online nodes to nodemask_t *op->arg1*, 0 returned. * Return * View above. + * + * int bpf_get_task_relationship_stats(struct task_struct *tsk, struct bpf_map *map, struct bpf_relationship_get_args *stats) + * Description + * get relationship statistics of *tsk* and store in *stats*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4081,7 @@ union bpf_attr { FN(cpumask_op), \ FN(cpus_share_cache), \ FN(nodemask_op), \ + FN(get_task_relationship_stats),\ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
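A hedged sketch of calling the new helper from a sched BPF program, not part of this patch. The helper takes the task, a map, and a pointer to a value from that map (ARG_CONST_MAP_PTR / ARG_PTR_TO_MAP_VALUE_OR_NULL), so the scratch buffer below is a single-entry array map; the map name, section name, and program logic are assumptions.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct bpf_relationship_get_args);
} rship_buf SEC(".maps");

SEC("sched/cfs_select_rq")		/* section name assumed */
int BPF_PROG(read_relationship, struct sched_migrate_ctx *ctx)
{
	struct bpf_relationship_get_args *args;
	__u32 key = 0;

	args = bpf_map_lookup_elem(&rship_buf, &key);
	if (!args)
		return -1;

	if (bpf_get_task_relationship_stats(ctx->task, &rship_buf, args))
		return -1;		/* no relationship data available */

	/* args->net.comm.preferred_node or args->mm.grp_faults_ordered
	 * could now steer placement; keep the default path here.
	 */
	return -1;
}

char _license[] SEC("license") = "GPL";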
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Introduce ioctl interfaces to get the relationship of a task, which makes it easier to obtain and use relationship data from user mode.
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched/relationship.h |   7 ++
 include/uapi/linux/sched_ctrl.h    |  57 ++++++++++++
 kernel/sched/Makefile              |   2 +-
 kernel/sched/fair.c                |  49 ++++++++++
 kernel/sched/relationship.c        |  31 +++++++
 kernel/sched/relationship_ioctl.c  | 142 +++++++++++++++++++++++++++++
 6 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/sched_ctrl.h
 create mode 100644 kernel/sched/relationship_ioctl.c
diff --git a/include/linux/sched/relationship.h b/include/linux/sched/relationship.h index f603ed71e3e4..45861f66ac4e 100644 --- a/include/linux/sched/relationship.h +++ b/include/linux/sched/relationship.h @@ -5,6 +5,7 @@ #include <linux/nodemask.h> #include <linux/jump_label.h> #include <linux/refcount.h> +#include <uapi/linux/sched_ctrl.h>
#define FAULT_NODES_MAX 4
@@ -127,6 +128,12 @@ extern void sched_relationship_free(struct task_struct *p); void task_relationship_free(struct task_struct *tsk, bool reset); extern bool task_relationship_supported(struct task_struct *tsk); extern int sched_net_relationship_submit(struct net_relationship_req *req); +extern void +sctl_sched_get_net_relationship(struct task_struct *tsk, + struct sctl_net_relationship_info *info); +extern void +sctl_sched_get_mem_relationship(struct task_struct *tsk, + struct sctl_mem_relationship_info *info); extern void sched_get_mm_relationship(struct task_struct *tsk, struct bpf_relationship_get_args *args); extern void sched_get_relationship(struct task_struct *tsk, diff --git a/include/uapi/linux/sched_ctrl.h b/include/uapi/linux/sched_ctrl.h new file mode 100644 index 000000000000..13a4eb182d5e --- /dev/null +++ b/include/uapi/linux/sched_ctrl.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_SCHED_CTRL_H +#define _LINUX_SCHED_CTRL_H + +#include <linux/types.h> + + +#define SCTL_IOC_MAGIC 'X' + +/* get task relationship */ +#define SCTL_GET_RSHIP \ + _IOR(SCTL_IOC_MAGIC, 0, struct sctl_get_relationship_args) + +#define SCTL_IOC_MAXNR 1 + +#define SCTL_MAX_NUMNODES 16 +#define SCTL_STR_MAX 64 +#define NR_TASK_FAULTS_TYPE 2 + +#define NO_RSHIP (-1) + +struct grp_hdr { + int gid; + char preferred_nid[SCTL_STR_MAX]; + int nr_tasks; +}; + +struct sctl_net_relationship_info { + int valid; + struct grp_hdr grp_hdr; + int nic_nid; + int rx_dev_idx; + int rx_dev_queue_idx; + unsigned long rx_dev_netns_cookie; + unsigned long rxtx_remote_bytes; + unsigned long rxtx_bytes; + unsigned long grp_rxtx_bytes; +}; + +struct sctl_mem_relationship_info { + int valid; + struct grp_hdr grp_hdr; + int nodes_num; + unsigned long total_faults; + unsigned long grp_total_faults; + unsigned long faults[SCTL_MAX_NUMNODES][NR_TASK_FAULTS_TYPE]; + unsigned long faults_cpu[SCTL_MAX_NUMNODES][NR_TASK_FAULTS_TYPE]; + unsigned long grp_faults[SCTL_MAX_NUMNODES][NR_TASK_FAULTS_TYPE]; + unsigned long grp_faults_cpu[SCTL_MAX_NUMNODES][NR_TASK_FAULTS_TYPE]; +}; + +struct sctl_get_relationship_args { + int tid; + struct sctl_net_relationship_info nrsi; + struct sctl_mem_relationship_info mrsi; +}; +#endif /* _LINUX_SCHED_CTRL_H */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 114dc36320c6..879c22e63c6c 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -40,4 +40,4 @@ obj-$(CONFIG_SCHED_CORE) += core_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_topology.o obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ -obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o +obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o relationship_ioctl.o diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1660079992b..439eb7f9791d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3043,6 +3043,55 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu) #endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +void sctl_sched_get_mem_relationship(struct task_struct *tsk, + struct sctl_mem_relationship_info *info) +{ +#ifdef CONFIG_NUMA_BALANCING + struct task_relationship *rship = tsk->rship; + int nid, priv, cpu_idx, mem_idx; + struct numa_group *grp; + + info->valid = false; + if (unlikely(!rship) || !tsk->numa_faults) + return; + + memset(info, 0, sizeof(*info)); + info->valid = true; + info->nodes_num = nr_node_ids; + info->grp_hdr.gid = NO_RSHIP; + info->total_faults = tsk->total_numa_faults; + + rcu_read_lock(); + + grp = rcu_dereference(tsk->numa_group); + if (grp) { + info->grp_hdr.gid = grp->gid; + info->grp_hdr.nr_tasks = grp->nr_tasks; + snprintf(info->grp_hdr.preferred_nid, SCTL_STR_MAX, "%*pbl", + nodemask_pr_args(&grp->preferred_nid)); + } + + for_each_online_node(nid) { + if (nid >= SCTL_MAX_NUMNODES) + break; + + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + info->faults[nid][priv] = tsk->numa_faults[mem_idx]; + info->faults_cpu[nid][priv] = tsk->numa_faults[cpu_idx]; + + if (grp) { + info->grp_faults[nid][priv] = grp->faults[mem_idx]; + info->grp_faults_cpu[nid][priv] = grp->faults_cpu[mem_idx]; + info->grp_total_faults = grp->total_faults; + } + } + } + + rcu_read_unlock(); +#endif +}
#ifdef CONFIG_BPF_SCHED void sched_get_mm_relationship(struct task_struct *tsk, diff --git a/kernel/sched/relationship.c b/kernel/sched/relationship.c index 89feb4b218d4..a85f85794f6e 100644 --- a/kernel/sched/relationship.c +++ b/kernel/sched/relationship.c @@ -366,6 +366,37 @@ void sched_get_relationship(struct task_struct *tsk, rcu_read_unlock(); }
+void sctl_sched_get_net_relationship(struct task_struct *tsk, + struct sctl_net_relationship_info *info) +{ + struct task_relationship *rship = tsk->rship; + struct net_group *grp; + + memset(info, 0, sizeof(*info)); + info->valid = true; + info->nic_nid = rship->nic_nid; + info->rx_dev_idx = rship->rx_dev_idx; + info->rx_dev_queue_idx = rship->rx_dev_queue_idx; + info->rx_dev_netns_cookie = rship->rx_dev_netns_cookie; + info->rxtx_remote_bytes = rship->rxtx_remote_bytes; + info->rxtx_bytes = rship->rxtx_bytes; + + info->grp_hdr.gid = NO_RSHIP; + + rcu_read_lock(); + + grp = rcu_dereference(rship->net_group); + if (grp) { + info->grp_hdr.gid = grp->hdr.gid; + info->grp_hdr.nr_tasks = grp->hdr.nr_tasks; + snprintf(info->grp_hdr.preferred_nid, SCTL_STR_MAX, "%*pbl", + nodemask_pr_args(&grp->hdr.preferred_nid)); + info->grp_rxtx_bytes = grp->rxtx_bytes; + } + + rcu_read_unlock(); +} + void task_relationship_free(struct task_struct *tsk, bool reset) { if (!task_relationship_used()) diff --git a/kernel/sched/relationship_ioctl.c b/kernel/sched/relationship_ioctl.c new file mode 100644 index 000000000000..229786961ec8 --- /dev/null +++ b/kernel/sched/relationship_ioctl.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for support ioctl for schedluler + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Hui Tang tanghui20@huawei.com + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ +#include <linux/string.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/compat.h> + +#include "sched.h" + +static int sched_ctl_open(struct inode *inode, struct file *filp) +{ + filp->private_data = NULL; + + return 0; +} + +static int sched_ctl_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int sched_ctrl_get_relationship(void __user *arg) +{ + struct sctl_get_relationship_args data; + struct task_struct *tsk; + pid_t pid; + + if (!task_relationship_used()) { + pr_err("task relationship disabled!\n"); + return -EPERM; + } + + if (copy_from_user(&data, arg, sizeof(data))) { + pr_err("fail to copy_from_user!\n"); + return -EFAULT; + } + + pid = data.tid; + + rcu_read_lock(); + + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + + if (!task_relationship_supported(tsk)) { + rcu_read_unlock(); + return -EPERM; + } + + sctl_sched_get_net_relationship(tsk, &data.nrsi); + sctl_sched_get_mem_relationship(tsk, &data.mrsi); + + rcu_read_unlock(); + + if (copy_to_user(arg, &data, sizeof(data))) { + pr_err("fail to copy_to_user!\n"); + return -EFAULT; + } + + return 0; +} + +static long sched_ctl_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int ret = 0; + struct sched_ctl_data *data; + + if (_IOC_TYPE(cmd) != SCTL_IOC_MAGIC) + return -ENOTTY; + + if (_IOC_NR(cmd) > SCTL_IOC_MAXNR) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + data = filp->private_data; + + switch (cmd) { + case SCTL_GET_RSHIP: + ret = sched_ctrl_get_relationship((void __user *)(uintptr_t)arg); + break; + default: + ret = -EINVAL; + + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static long +sched_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)(uintptr_t)compat_ptr(arg); + return sched_ctl_ioctl(file, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static const struct file_operations sched_ctl_fops = { + .open = sched_ctl_open, + .release = sched_ctl_release, + .llseek = no_llseek, + .unlocked_ioctl = sched_ctl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sched_ctl_compat_ioctl, +#endif +}; + +static struct miscdevice sched_ctl_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "relationship_ctrl", + .fops = &sched_ctl_fops, +}; + +static int __init sched_ctl_device_init(void) +{ + return misc_register(&sched_ctl_device); +}; + +device_initcall(sched_ctl_device_init);
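As a usage sketch, the following userspace program queries the new interface. It assumes the misc device is exposed as /dev/relationship_ctrl (derived from the device name registered above), that the UAPI header is installed as <linux/sched_ctrl.h>, and that the caller has CAP_SYS_ADMIN; none of this is part of the patch itself.

/* Query net/memory relationship of a task via SCTL_GET_RSHIP. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/sched_ctrl.h>

int main(int argc, char **argv)
{
	struct sctl_get_relationship_args args;
	int fd;

	fd = open("/dev/relationship_ctrl", O_RDWR);
	if (fd < 0) {
		perror("open /dev/relationship_ctrl");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	args.tid = argc > 1 ? atoi(argv[1]) : getpid();

	if (ioctl(fd, SCTL_GET_RSHIP, &args)) {
		perror("SCTL_GET_RSHIP");
		close(fd);
		return 1;
	}

	if (args.nrsi.valid)
		printf("net group %d rxtx_bytes %lu\n",
		       args.nrsi.grp_hdr.gid, args.nrsi.rxtx_bytes);
	if (args.mrsi.valid)
		printf("numa group %d preferred nid %s total_faults %lu\n",
		       args.mrsi.grp_hdr.gid, args.mrsi.grp_hdr.preferred_nid,
		       args.mrsi.total_faults);

	close(fd);
	return 0;
}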
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ CVE: NA
--------------------------------
Implement a mechanism to periodically adjust the preferred NUMA nodes for a relationship group, and allow different adjustment policies to be customized in a BPF program.
Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched/relationship.h | 24 +++++++++++ include/linux/sched_hook_defs.h | 2 + kernel/sched/fair.c | 67 +++++++++++++++++++++++++++++- kernel/sched/relationship.c | 4 ++ 4 files changed, 95 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched/relationship.h b/include/linux/sched/relationship.h index 45861f66ac4e..fbc5c2bab5dc 100644 --- a/include/linux/sched/relationship.h +++ b/include/linux/sched/relationship.h @@ -113,7 +113,25 @@ struct task_relationship {
/* extras numa fault data */ struct numa_fault_ext faults; + +#ifdef CONFIG_NUMA_BALANCING + /* preferred nodes adjust */ + u64 node_stamp; + struct callback_head node_work; +#endif +}; + +#ifdef CONFIG_BPF_SCHED +struct sched_preferred_node_ctx { + struct task_struct *tsk; + nodemask_t preferred_node; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) }; +#endif
extern void task_relationship_enable(void); extern void task_relationship_disable(void); @@ -140,6 +158,9 @@ extern void sched_get_relationship(struct task_struct *tsk, struct bpf_relationship_get_args *args); extern void numa_faults_update_and_sort(int nid, int new, struct fault_array_info *stats); +extern void task_tick_relationship(struct rq *rq, struct task_struct *curr); + +extern void task_preferred_node_work(struct callback_head *work);
DECLARE_STATIC_KEY_FALSE(__relationship_switch); static inline bool task_relationship_used(void) @@ -167,6 +188,9 @@ sched_net_relationship_submit(struct net_relationship_req *req) { return 0; } + +static inline void +task_tick_relationship(struct rq *rq, struct task_struct *curr) {} #endif
#endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index e2519a00aa6b..0a871f728c85 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -12,3 +12,5 @@ BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p, struct sched_migrate_node *migrate_node) +BPF_SCHED_HOOK(void, (void) 0, cfs_change_preferred_node, + struct sched_preferred_node_ctx *ctx) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 439eb7f9791d..3322c4ececd0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1090,6 +1090,8 @@ struct numa_group { struct fault_array_info score_ordered[FAULT_NODES_MAX]; struct fault_array_info faults_ordered[FAULT_NODES_MAX]; nodemask_t preferred_nid; + u64 node_stamp; + u64 nodes_switch_cnt; #endif /* * Faults_cpu is used to decide whether memory should move @@ -2547,6 +2549,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, #ifdef CONFIG_SCHED_TASK_RELATIONSHIP if (task_relationship_used()) { grp->preferred_nid = NODE_MASK_NONE; + grp->node_stamp = jiffies; for (i = 0; i < FAULT_NODES_MAX; i++) { grp->faults_ordered[i].nid = -1; grp->score_ordered[i].nid = -1; @@ -13271,6 +13274,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) update_overutilized_status(task_rq(curr));
task_tick_core(rq, curr); + + task_tick_relationship(rq, curr); }
/* @@ -13861,8 +13866,9 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m)
ng = rcu_dereference(p->numa_group); if (ng) { - seq_printf(m, "numa group preferred nid %*pbl\n", - nodemask_pr_args(&ng->preferred_nid)); + seq_printf(m, "numa group preferred nid %*pbl switch_cnt %llu\n", + nodemask_pr_args(&ng->preferred_nid), + ng->nodes_switch_cnt); }
net_grp = rcu_dereference(p->rship->net_group); @@ -13877,6 +13883,63 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m) } #endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +void task_preferred_node_work(struct callback_head *work) +{ +#ifdef CONFIG_NUMA_BALANCING + struct task_struct *curr = current; + struct numa_group *numa_grp; +#ifdef CONFIG_BPF_SCHED + struct sched_preferred_node_ctx ctx = {0}; +#endif + + work->next = work; + +#ifdef CONFIG_BPF_SCHED + numa_grp = deref_curr_numa_group(curr); + if (numa_grp) { + + spin_lock_irq(&numa_grp->lock); + ctx.tsk = curr; + ctx.preferred_node = numa_grp->preferred_nid; + bpf_sched_cfs_change_preferred_node(&ctx); + spin_unlock_irq(&numa_grp->lock); + } +#endif +#endif +} + +void task_tick_relationship(struct rq *rq, struct task_struct *curr) +{ +#ifdef CONFIG_NUMA_BALANCING + struct callback_head *work = &curr->rship->node_work; + struct numa_group *numa_grp; + + if (!task_relationship_supported(curr)) + return; + + if (work->next != work) + return; + + numa_grp = deref_curr_numa_group(curr); + if (!numa_grp || numa_grp->nr_tasks <= 1) + return; + + spin_lock(&numa_grp->lock); + + if (time_after(jiffies, + (unsigned long)(numa_grp->node_stamp + msecs_to_jiffies(100)))) { + numa_grp->node_stamp = jiffies; + spin_unlock(&numa_grp->lock); + task_work_add(curr, &curr->rship->node_work, TWA_RESUME); + return; + } + + spin_unlock(&numa_grp->lock); +#endif +} +#endif + __init void init_sched_fair_class(void) { #ifdef CONFIG_QOS_SCHED diff --git a/kernel/sched/relationship.c b/kernel/sched/relationship.c index a85f85794f6e..515c913aeb33 100644 --- a/kernel/sched/relationship.c +++ b/kernel/sched/relationship.c @@ -422,6 +422,10 @@ int sched_relationship_fork(struct task_struct *p)
spin_lock_init(&p->rship->net_lock); init_task_work(&p->rship->cb.twork, task_net_relationship_work); +#ifdef CONFIG_NUMA_BALANCING + p->rship->node_work.next = &p->rship->node_work; + init_task_work(&p->rship->node_work, task_preferred_node_work); +#endif return 0; }
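To illustrate how the new hook is meant to be consumed, here is an observational BPF sketch. The "sched/cfs_change_preferred_node" section name, the BPF_PROG() wrapper and the direct include of <linux/sched/relationship.h> follow the conventions used by the existing cfs hooks and are assumptions of this example, not something added by this patch.

/* Dump the numa group's current preferred node each time the periodic
 * node_work fires; the policy decision is left to a later patch that
 * adds a setter helper.
 */
#include <linux/bpf.h>
#include <linux/sched/relationship.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_change_preferred_node")
int BPF_PROG(dump_preferred_node, struct sched_preferred_node_ctx *ctx)
{
	nodemask_t nodes;

	/* Snapshot taken under the numa group lock on the kernel side. */
	bpf_probe_read_kernel(&nodes, sizeof(nodes), &ctx->preferred_node);
	bpf_printk("preferred node mask %lx\n", nodes.bits[0]);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";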
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ CVE: NA
--------------------------------
Introduce a BPF helper to set the preferred nodes for a relationship group.
Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched/relationship.h | 6 ++++++ include/uapi/linux/bpf.h | 7 +++++++ kernel/sched/bpf_sched.c | 20 ++++++++++++++++++++ kernel/sched/fair.c | 10 ++++++++++ scripts/bpf_helpers_doc.py | 4 ++++ tools/include/uapi/linux/bpf.h | 7 +++++++ tools/lib/bpf/libbpf_sched.h | 29 ++++++++++++++++++++++++++++- 7 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/relationship.h b/include/linux/sched/relationship.h index fbc5c2bab5dc..43aa3f9706d4 100644 --- a/include/linux/sched/relationship.h +++ b/include/linux/sched/relationship.h @@ -45,6 +45,10 @@ struct bpf_relationship_get_args { struct bpf_net_relationship net; };
+struct bpf_relationship_set_args { + nodemask_t preferred_node; +}; + struct relationship_hdr { refcount_t refcount; spinlock_t lock; @@ -161,6 +165,8 @@ extern void numa_faults_update_and_sort(int nid, int new, extern void task_tick_relationship(struct rq *rq, struct task_struct *curr);
extern void task_preferred_node_work(struct callback_head *work); +extern void +sched_set_curr_preferred_node(struct bpf_relationship_set_args *args);
DECLARE_STATIC_KEY_FALSE(__relationship_switch); static inline bool task_relationship_used(void) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1274fe6d3ab8..8aba6670549c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3908,6 +3908,12 @@ union bpf_attr { * get relationship statistics of *tsk* and store in *stats*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_curr_preferred_node(struct bpf_relationship_set_args *args, int len) + * Description + * set current task preferred node. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4082,6 +4088,7 @@ union bpf_attr { FN(cpus_share_cache), \ FN(nodemask_op), \ FN(get_task_relationship_stats),\ + FN(sched_set_curr_preferred_node),\ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 3cff265526b2..ac1b94ea6740 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -369,6 +369,24 @@ const struct bpf_func_proto bpf_get_task_relationship_stats_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, }; + +BPF_CALL_2(bpf_sched_set_curr_preferred_node, + struct bpf_relationship_set_args *, args, int, len) +{ + if (!args || len != sizeof(*args)) + return -EINVAL; + + sched_set_curr_preferred_node(args); + return 0; +} + +const struct bpf_func_proto bpf_sched_set_curr_preferred_node_proto = { + .func = bpf_sched_set_curr_preferred_node, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, +}; #endif
static const struct bpf_func_proto * @@ -398,6 +416,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SCHED_TASK_RELATIONSHIP case BPF_FUNC_get_task_relationship_stats: return &bpf_get_task_relationship_stats_proto; + case BPF_FUNC_sched_set_curr_preferred_node: + return &bpf_sched_set_curr_preferred_node_proto; #endif default: return bpf_base_func_proto(func_id); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3322c4ececd0..f027d4197b14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3116,6 +3116,16 @@ void sched_get_mm_relationship(struct task_struct *tsk, } #endif } + +void sched_set_curr_preferred_node(struct bpf_relationship_set_args *args) +{ +#ifdef CONFIG_NUMA_BALANCING + struct numa_group *grp = rcu_dereference_raw(current->numa_group); + + grp->preferred_nid = args->preferred_node; + schedstat_inc(grp->nodes_switch_cnt); +#endif +} #endif
#endif diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index db43107ba6f0..4f6fac621f65 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -447,6 +447,8 @@ class PrinterHelpers(Printer): 'struct sched_migrate_node', 'struct nodemask_op_args', 'struct bpf_relationship_get_args', + 'struct bpf_relationship_set_args', + 'struct sched_preferred_node_ctx', ] known_types = { '...', @@ -502,6 +504,8 @@ class PrinterHelpers(Printer): 'struct sched_migrate_node', 'struct nodemask_op_args', 'struct bpf_relationship_get_args', + 'struct bpf_relationship_set_args', + 'struct sched_preferred_node_ctx', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ac08b57826a2..5c04747f201c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3908,6 +3908,12 @@ union bpf_attr { * get relationship statistics of *tsk* and store in *stats*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_curr_preferred_node(struct bpf_relationship_set_args *args, int len) + * Description + * set current task preferred node. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4082,6 +4088,7 @@ union bpf_attr { FN(cpus_share_cache), \ FN(nodemask_op), \ FN(get_task_relationship_stats),\ + FN(sched_set_curr_preferred_node),\ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h index 7fa5f03e6ba4..04af73b92856 100644 --- a/tools/lib/bpf/libbpf_sched.h +++ b/tools/lib/bpf/libbpf_sched.h @@ -17,6 +17,7 @@
#include <linux/bpf_topology.h> #include <linux/numa.h> +#include <linux/sched/relationship.h> #include <linux/version.h> #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> @@ -27,7 +28,7 @@ #define INVALID_PTR ((void *)(0UL)) #define getVal(P) \ ({ \ - typeof(P) val = 0; \ + typeof(P) val; \ bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ val; \ }) @@ -79,6 +80,13 @@ struct { __uint(max_entries, 1); } map_cpumask_info SEC(".maps");
+static struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct bpf_relationship_get_args); + __uint(max_entries, 1); +} map_rship_stats SEC(".maps"); + static __always_inline void libbpf_nodes_and(nodemask_t *dst, nodemask_t *src1, nodemask_t *src2) { @@ -614,4 +622,23 @@ static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se)
return se_tag; } + +static __always_inline int +libbpf_mem_preferred_nid(struct task_struct *tsk, nodemask_t *preferred_node) +{ + struct bpf_relationship_get_args *stats; + int key = 0; + int ret; + + stats = bpf_map_lookup_elem(&map_rship_stats, &key); + if (!stats) + return NUMA_NO_NODE; + + ret = bpf_get_task_relationship_stats(tsk, &map_rship_stats, stats); + if (ret) + return NUMA_NO_NODE; + + *preferred_node = getVal(stats->mm.comm.preferred_node); + return 0; +} #endif
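Putting the new helper together with the libbpf_sched additions, a policy program on the hook from the previous patch might look like the sketch below. The section name, the BPF_PROG() wrapper and the simple "adopt the memory-preferred node" policy are illustrative assumptions; only bpf_sched_set_curr_preferred_node() and libbpf_mem_preferred_nid() come from this patch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/libbpf_sched.h>

SEC("sched/cfs_change_preferred_node")
int BPF_PROG(set_preferred_node, struct sched_preferred_node_ctx *ctx)
{
	struct bpf_relationship_set_args args = {};
	struct task_struct *tsk;
	nodemask_t preferred;

	tsk = getVal(ctx->tsk);
	if (!tsk)
		return 0;

	/* A non-zero return means no usable memory statistics yet. */
	if (libbpf_mem_preferred_nid(tsk, &preferred))
		return 0;

	/* Feed the memory-preferred node back to the numa group. */
	args.preferred_node = preferred;
	bpf_sched_set_curr_preferred_node(&args, sizeof(args));

	return 0;
}

char LICENSE[] SEC("license") = "GPL";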
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ CVE: NA
--------------------------------
Introduce NUMA isolation and consolidation. If enabled, the scheduler identifies relationships between tasks and tracks NUMA resource usage.
Use the 'numa_icon=enable/disable' kernel boot parameter to control the feature.
Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched.h | 15 ++++ include/uapi/linux/bpf.h | 7 ++ init/Kconfig | 13 +++ kernel/sched/Makefile | 1 + kernel/sched/bpf_sched.c | 29 +++++++ kernel/sched/fair.c | 14 ++++ kernel/sched/numa_icon.c | 144 +++++++++++++++++++++++++++++++++ kernel/sched/numa_icon.h | 43 ++++++++++ kernel/sched/sched.h | 2 + scripts/bpf_helpers_doc.py | 2 + tools/include/uapi/linux/bpf.h | 7 ++ tools/lib/bpf/libbpf_sched.h | 24 ++++++ 12 files changed, 301 insertions(+) create mode 100644 kernel/sched/numa_icon.c create mode 100644 kernel/sched/numa_icon.h
diff --git a/include/linux/sched.h b/include/linux/sched.h index af43d8d55e1b..fa83018137ce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2360,6 +2360,21 @@ struct bpf_sched_cpu_stats { KABI_RESERVE(4) };
+struct bpf_node_stats { + unsigned long util; + unsigned long compute_capacity; + unsigned int weight; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) + KABI_RESERVE(5) + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) +}; + struct cpumask_op_args { unsigned int op_type; void *arg1; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8aba6670549c..b87934003c40 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3914,6 +3914,12 @@ union bpf_attr { * set current task preferred node. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_get_node_stats(int nid, struct bpf_node_stats *ctx, int len) + * Description + * get resource statistics of *nid* and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4089,6 +4095,7 @@ union bpf_attr { FN(nodemask_op), \ FN(get_task_relationship_stats),\ FN(sched_set_curr_preferred_node),\ + FN(get_node_stats), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/init/Kconfig b/init/Kconfig index ea9a6e93155b..758b9988d742 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1091,6 +1091,19 @@ config SCHED_TASK_RELATIONSHIP
If in doubt, say N.
+config QOS_SCHED_NUMA_ICON + bool "numa aware schedule" + depends on BPF_SCHED + depends on SCHED_TASK_RELATIONSHIP + default n + help + This feature provides the NUMA Isolation and Consolidationthe + Mechanisms based on ebpf and task relationship. If enabled, scheduler + places related tasks on same numa node when the node has spare + resource. + + If in doubt, say N. + config UCLAMP_TASK_GROUP bool "Utilization clamping per group of tasks" depends on CGROUP_SCHED diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 879c22e63c6c..ff9ff2c17f79 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -41,3 +41,4 @@ obj-$(CONFIG_BPF_SCHED) += bpf_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_topology.o obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o relationship_ioctl.o +obj-$(CONFIG_QOS_SCHED_NUMA_ICON) += numa_icon.o diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index ac1b94ea6740..3e14d1fa911e 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -346,6 +346,31 @@ static const struct bpf_func_proto bpf_cpus_share_cache_proto = { .arg2_type = ARG_ANYTHING, };
+#ifdef CONFIG_QOS_SCHED_NUMA_ICON +BPF_CALL_3(bpf_get_node_stats, int, nid, + struct bpf_node_stats *, ctx, + int, len) +{ + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)nid >= nr_node_ids) + return -EINVAL; + + sched_get_node_load(nid, ctx); + return 0; +} + +const struct bpf_func_proto bpf_get_node_stats_proto = { + .func = bpf_get_node_stats, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; +#endif + #ifdef CONFIG_SCHED_TASK_RELATIONSHIP BPF_CALL_3(bpf_get_task_relationship_stats, struct task_struct *, tsk, struct bpf_map *, map, struct bpf_relationship_get_args *, args) @@ -413,6 +438,10 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_cpus_share_cache_proto; case BPF_FUNC_nodemask_op: return &bpf_nodemask_op_proto; +#ifdef CONFIG_QOS_SCHED_NUMA_ICON + case BPF_FUNC_get_node_stats: + return &bpf_get_node_stats_proto; +#endif #ifdef CONFIG_SCHED_TASK_RELATIONSHIP case BPF_FUNC_get_task_relationship_stats: return &bpf_get_task_relationship_stats_proto; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f027d4197b14..7289ae80c936 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3954,6 +3954,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
cfs_rq_util_change(cfs_rq, 0);
+ numa_load_change(cfs_rq); + trace_pelt_cfs_tp(cfs_rq); }
@@ -3984,6 +3986,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
cfs_rq_util_change(cfs_rq, 0);
+ numa_load_change(cfs_rq); + trace_pelt_cfs_tp(cfs_rq); }
@@ -4024,6 +4028,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
} else if (decayed) { cfs_rq_util_change(cfs_rq, 0); + numa_load_change(cfs_rq);
if (flags & UPDATE_TG) update_tg_load_avg(cfs_rq); @@ -13286,6 +13291,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_core(rq, curr);
task_tick_relationship(rq, curr); + + update_numa_capacity(rq); }
/* @@ -13868,6 +13875,7 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_TASK_RELATIONSHIP struct net_group *net_grp; struct numa_group *ng; + int node;
if (!task_relationship_used()) return; @@ -13889,6 +13897,10 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m) }
rcu_read_unlock(); + + for_each_online_node(node) { + print_node_load_info(m, node); + } #endif } #endif /* CONFIG_SCHED_DEBUG */ @@ -13959,6 +13971,8 @@ __init void init_sched_fair_class(void) INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i)); #endif
+ init_sched_numa_icon(); + #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/numa_icon.c b/kernel/sched/numa_icon.c new file mode 100644 index 000000000000..e9825ac7f866 --- /dev/null +++ b/kernel/sched/numa_icon.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for task numa isolation consolidation + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Hui Tang tanghui20@huawei.com + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include "sched.h" + +static bool __sched_numa_icon_switch __initdata; +DEFINE_STATIC_KEY_FALSE(sched_numa_icon_switch); + +struct node_load_info *node_load_ptr; + +static void set_numa_icon_switch(bool enabled) +{ + if (enabled) { + static_branch_enable(&sched_numa_icon_switch); + task_relationship_enable(); + } else { + static_branch_disable(&sched_numa_icon_switch); + task_relationship_disable(); + } +} + +static int __init numa_icon_switch_setup(char *str) +{ + int ret = 0; + + if (!str) + goto out; + + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_numa_icon_switch() can do it later. + */ + if (!strcmp(str, "enable")) { + __sched_numa_icon_switch = true; + ret = 1; + } else if (!strcmp(str, "disable")) { + __sched_numa_icon_switch = false; + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse numa_icon=\n"); + + return ret; +} +__setup("numa_icon=", numa_icon_switch_setup); + +__init void init_sched_numa_icon(void) +{ + int i; + + set_numa_icon_switch(__sched_numa_icon_switch); + + if (!sched_numa_icon_enabled()) + return; + + node_load_ptr = kcalloc(nr_node_ids, sizeof(struct node_load_info), + GFP_KERNEL); + + for (i = 0; i < nr_node_ids; i++) { + raw_spin_lock_init(&node_load_ptr[i].lock); + node_load_ptr[i].util_avg_last = + kcalloc(nr_cpu_ids, sizeof(struct sched_avg), GFP_KERNEL); + } + + for_each_possible_cpu(i) { + node_load_ptr[cpu_to_node(i)].compute_capacity += + SCHED_CAPACITY_SCALE; + } +} + +void print_node_load_info(struct seq_file *m, int node) +{ + if (!sched_numa_icon_enabled()) + return; + + seq_printf(m, "node %d capacity=%lu util_avg=%lu\n", node, + node_load_ptr[node].compute_capacity, + atomic_long_read(&node_load_ptr[node].util_avg)); +} + +void numa_load_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + int nid = cpu_to_node(cpu); + struct sched_avg *avg_old; + long delta; + + if (!sched_numa_icon_enabled()) + return; + + avg_old = &node_load_ptr[nid].util_avg_last[cpu]; + + if (&rq->cfs != cfs_rq) + return; + + delta = cfs_rq->avg.util_avg - avg_old->util_avg; + atomic_long_add(delta, &node_load_ptr[nid].util_avg); + avg_old->util_avg = cfs_rq->avg.util_avg; +} + +void update_numa_capacity(struct rq *rq) +{ + int cpu = cpu_of(rq); + int nid = cpu_to_node(cpu); + unsigned long capacity = 0; + + if (!sched_numa_icon_enabled()) + return; + + if (cpu != cpumask_first(cpumask_of_node(nid))) + return; + + for_each_cpu(cpu, cpumask_of_node(nid)) { + capacity += cpu_rq(cpu)->cpu_capacity; + } + node_load_ptr[nid].compute_capacity = capacity; +} + +#ifdef 
CONFIG_BPF_SCHED +void sched_get_node_load(int nid, struct bpf_node_stats *ctx) +{ + ctx->util = atomic_long_read(&node_load_ptr[nid].util_avg); + ctx->compute_capacity = node_load_ptr[nid].compute_capacity; + ctx->weight = cpumask_weight(cpumask_of_node(nid)); +} +#endif diff --git a/kernel/sched/numa_icon.h b/kernel/sched/numa_icon.h new file mode 100644 index 000000000000..adeed53e9f14 --- /dev/null +++ b/kernel/sched/numa_icon.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_NUMA_ICON_H +#include <linux/sched.h> + +struct node_load_info { + raw_spinlock_t lock ____cacheline_aligned; + atomic_long_t util_avg; + unsigned long compute_capacity; + struct sched_avg *util_avg_last; +}; + +#ifdef CONFIG_QOS_SCHED_NUMA_ICON +extern struct static_key_false sched_numa_icon_switch; +static __always_inline bool sched_numa_icon_enabled(void) +{ + return static_branch_unlikely(&sched_numa_icon_switch); +} + +extern void print_node_load_info(struct seq_file *m, int node); +extern __init void init_sched_numa_icon(void); +extern void sched_get_node_load(int nid, struct bpf_node_stats *ctx); +extern void init_node_load(struct rq *rq); +extern void numa_load_change(struct cfs_rq *cfs_rq); +extern void update_numa_capacity(struct rq *rq); + +#else /* !CONFIG_QOS_SCHED_NUMA_ICON */ +static inline void init_sched_numa_icon(void) {} + +static inline void init_node_load(struct rq *rq) {} + +static inline void numa_load_change(struct cfs_rq *cfs_rq) {} + +static inline void update_numa_capacity(struct rq *rq) {} + +static inline void print_node_load_info(struct seq_file *m, int node) {} + +static __always_inline bool sched_numa_icon_enabled(void) +{ + return false; +} +#endif /* CONFIG_QOS_SCHED_NUMA_ICON */ + +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e6f934af7062..3b2fc472908a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -76,6 +76,8 @@
#include "cpupri.h" #include "cpudeadline.h" +#include "numa_icon.h" +#include <uapi/linux/sched_ctrl.h>
#include <trace/events/sched.h>
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 4f6fac621f65..3afc3e354844 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -449,6 +449,7 @@ class PrinterHelpers(Printer): 'struct bpf_relationship_get_args', 'struct bpf_relationship_set_args', 'struct sched_preferred_node_ctx', + 'struct bpf_node_stats', ] known_types = { '...', @@ -506,6 +507,7 @@ class PrinterHelpers(Printer): 'struct bpf_relationship_get_args', 'struct bpf_relationship_set_args', 'struct sched_preferred_node_ctx', + 'struct bpf_node_stats', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5c04747f201c..5a153a1a8f18 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3914,6 +3914,12 @@ union bpf_attr { * set current task preferred node. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_get_node_stats(int nid, struct bpf_node_stats *ctx, int len) + * Description + * get resource statistics of *nid* and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4089,6 +4095,7 @@ union bpf_attr { FN(nodemask_op), \ FN(get_task_relationship_stats),\ FN(sched_set_curr_preferred_node),\ + FN(get_node_stats), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h index 04af73b92856..3e9b41788637 100644 --- a/tools/lib/bpf/libbpf_sched.h +++ b/tools/lib/bpf/libbpf_sched.h @@ -623,6 +623,30 @@ static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se) return se_tag; }
+static __always_inline unsigned long libbpf_node_cfs_util_of(int nid) +{ + struct bpf_node_stats stats = {0}; + + bpf_get_node_stats(nid, &stats, sizeof(stats)); + return getVal(stats.util); +} + +static __always_inline unsigned long libbpf_node_cfs_capacity_of(int nid) +{ + struct bpf_node_stats stats = {0}; + + bpf_get_node_stats(nid, &stats, sizeof(stats)); + return getVal(stats.compute_capacity); +} + +static __always_inline unsigned int libbpf_node_weight_of(int nid) +{ + struct bpf_node_stats stats = {0}; + + bpf_get_node_stats(nid, &stats, sizeof(stats)); + return getVal(stats.weight); +} + static __always_inline int libbpf_mem_preferred_nid(struct task_struct *tsk, nodemask_t *preferred_node) {
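As an example of consuming the per-node statistics, the helper below scans the nodes and returns the one with the lowest utilization-to-capacity ratio; a cfs_select_rq-style program could then steer related tasks toward that node's CPUs while it still has spare capacity. SKETCH_MAX_NODES and the function name are local to this sketch; only the libbpf_node_*_of() wrappers come from this patch.

#include <bpf/libbpf_sched.h>

#define SKETCH_MAX_NODES 16	/* compile-time loop bound for the verifier */

static __always_inline int sketch_pick_idlest_node(int nr_nodes)
{
	unsigned long util, cap, score, best_score = (unsigned long)-1;
	int nid, best_nid = 0;

	for (nid = 0; nid < SKETCH_MAX_NODES && nid < nr_nodes; nid++) {
		util = libbpf_node_cfs_util_of(nid);
		cap = libbpf_node_cfs_capacity_of(nid);
		if (!cap)
			continue;

		/* Scale so nodes of different sizes compare fairly. */
		score = util * 1024 / cap;
		if (score < best_score) {
			best_score = score;
			best_nid = nid;
		}
	}

	return best_nid;
}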
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ CVE: NA
--------------------------------
Enable CONFIG_SCHED_TASK_RELATIONSHIP, CONFIG_QOS_SCHED_NUMA_ICON and CONFIG_BPF_SCHED for arm64 by default, and add the new symbols as disabled in the x86 defconfig.
Signed-off-by: Hui Tang tanghui20@huawei.com --- arch/arm64/configs/openeuler_defconfig | 4 +++- arch/x86/configs/openeuler_defconfig | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 34061d75a0d2..745897d5f4d6 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -162,6 +162,8 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y +CONFIG_SCHED_TASK_RELATIONSHIP=y +CONFIG_QOS_SCHED_NUMA_ICON=y CONFIG_QOS_SCHED_SMART_GRID=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y @@ -234,7 +236,7 @@ CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_BASE_RELATIVE=y # CONFIG_BPF_LSM is not set -# CONFIG_BPF_SCHED is not set +CONFIG_BPF_SCHED=y CONFIG_BPF_SYSCALL=y CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 1835f38f2947..3c9d3d4e3964 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -167,6 +167,8 @@ CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_QOS_SCHED_DYNAMIC_AFFINITY=y +# CONFIG_SCHED_TASK_RELATIONSHIP is not set +# CONFIG_QOS_SCHED_NUMA_ICON is not set # CONFIG_QOS_SCHED_SMART_GRID is not set CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y
Feedback: The patch(es) you sent to kernel@openeuler.org could not be converted to a PR. Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/P... Failure reason: applying the patch(es) failed, Patch failed at 0001 sched: Introduce CONFIG_TASK_PLACEMENT_BY_CPU_RANGE. Suggested solution: check the failure reason and confirm whether the patch(es) apply on top of the latest code of the expected branch.