hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Introduce NUMA isolation and consolidation. If enabled, the scheduler
identifies relationships between tasks and tracks per-node NUMA resource
usage.

Use 'numa_icon=enable/disable' on the kernel command line to control the
feature.
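
As an illustration only, a sched BPF program could consult the per-node
statistics through the libbpf_sched.h wrappers added by this patch. The
helper node_has_spare_capacity(), the ~80% threshold and the include lines
below are example assumptions, not part of this patch:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>
  #include "libbpf_sched.h"

  /* Example policy: treat a node as a consolidation target while its
   * CFS utilization stays below ~80% of its compute capacity.
   */
  static __always_inline int node_has_spare_capacity(int nid)
  {
          unsigned long util = libbpf_node_cfs_util_of(nid);
          unsigned long cap = libbpf_node_cfs_capacity_of(nid);

          return util * 5 < cap * 4;
  }

  char _license[] SEC("license") = "GPL";
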
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h          |  15 ++++
 include/uapi/linux/bpf.h       |   7 ++
 init/Kconfig                   |  13 +++
 kernel/sched/Makefile          |   1 +
 kernel/sched/bpf_sched.c       |  29 +++++++
 kernel/sched/fair.c            |  14 ++++
 kernel/sched/numa_icon.c       | 144 +++++++++++++++++++++++++++++++++
 kernel/sched/numa_icon.h       |  43 ++++++++++
 kernel/sched/sched.h           |   2 +
 scripts/bpf_helpers_doc.py     |   2 +
 tools/include/uapi/linux/bpf.h |   7 ++
 tools/lib/bpf/libbpf_sched.h   |  24 ++++++
 12 files changed, 301 insertions(+)
 create mode 100644 kernel/sched/numa_icon.c
 create mode 100644 kernel/sched/numa_icon.h
diff --git a/include/linux/sched.h b/include/linux/sched.h
index af43d8d55e1b..fa83018137ce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2360,6 +2360,21 @@ struct bpf_sched_cpu_stats {
 	KABI_RESERVE(4)
 };
 
+struct bpf_node_stats {
+	unsigned long util;
+	unsigned long compute_capacity;
+	unsigned int weight;
+
+	KABI_RESERVE(1)
+	KABI_RESERVE(2)
+	KABI_RESERVE(3)
+	KABI_RESERVE(4)
+	KABI_RESERVE(5)
+	KABI_RESERVE(6)
+	KABI_RESERVE(7)
+	KABI_RESERVE(8)
+};
+
 struct cpumask_op_args {
 	unsigned int op_type;
 	void *arg1;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8aba6670549c..b87934003c40 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3914,6 +3914,12 @@ union bpf_attr {
  *		set current task preferred node.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_node_stats(int nid, struct bpf_node_stats *ctx, int len)
+ *	Description
+ *		get resource statistics of *nid* and store in *ctx*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4089,6 +4095,7 @@ union bpf_attr {
 	FN(nodemask_op),		\
 	FN(get_task_relationship_stats),\
 	FN(sched_set_curr_preferred_node),\
+	FN(get_node_stats),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/init/Kconfig b/init/Kconfig
index 7bcc7d5a8584..b722b7a887c1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1090,6 +1090,19 @@ config SCHED_TASK_RELATIONSHIP
 
 	  If in doubt, say N.
 
+config QOS_SCHED_NUMA_ICON
+	bool "NUMA aware scheduling"
+	depends on BPF_SCHED
+	depends on SCHED_TASK_RELATIONSHIP
+	default n
+	help
+	  This feature provides the NUMA isolation and consolidation
+	  mechanism based on eBPF and task relationship. If enabled, the
+	  scheduler places related tasks on the same NUMA node when the
+	  node has spare resources.
+
+	  If in doubt, say N.
+
 config UCLAMP_TASK_GROUP
 	bool "Utilization clamping per group of tasks"
 	depends on CGROUP_SCHED
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 879c22e63c6c..ff9ff2c17f79 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -41,3 +41,4 @@ obj-$(CONFIG_BPF_SCHED) += bpf_sched.o
 obj-$(CONFIG_BPF_SCHED) += bpf_topology.o
 obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/
 obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o relationship_ioctl.o
+obj-$(CONFIG_QOS_SCHED_NUMA_ICON) += numa_icon.o
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
index ac1b94ea6740..3e14d1fa911e 100644
--- a/kernel/sched/bpf_sched.c
+++ b/kernel/sched/bpf_sched.c
@@ -346,6 +346,31 @@ static const struct bpf_func_proto bpf_cpus_share_cache_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+#ifdef CONFIG_QOS_SCHED_NUMA_ICON
+BPF_CALL_3(bpf_get_node_stats, int, nid,
+	   struct bpf_node_stats *, ctx,
+	   int, len)
+{
+	if (len != sizeof(*ctx))
+		return -EINVAL;
+
+	if ((unsigned int)nid >= nr_node_ids)
+		return -EINVAL;
+
+	sched_get_node_load(nid, ctx);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_get_node_stats_proto = {
+	.func		= bpf_get_node_stats,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+#endif
+
 #ifdef CONFIG_SCHED_TASK_RELATIONSHIP
 BPF_CALL_3(bpf_get_task_relationship_stats, struct task_struct *, tsk,
 	   struct bpf_map *, map, struct bpf_relationship_get_args *, args)
@@ -413,6 +438,10 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_cpus_share_cache_proto;
 	case BPF_FUNC_nodemask_op:
 		return &bpf_nodemask_op_proto;
+#ifdef CONFIG_QOS_SCHED_NUMA_ICON
+	case BPF_FUNC_get_node_stats:
+		return &bpf_get_node_stats_proto;
+#endif
 #ifdef CONFIG_SCHED_TASK_RELATIONSHIP
 	case BPF_FUNC_get_task_relationship_stats:
 		return &bpf_get_task_relationship_stats_proto;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c64055874a73..404358af80c7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3954,6 +3954,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	cfs_rq_util_change(cfs_rq, 0);
 
+	numa_load_change(cfs_rq);
+
 	trace_pelt_cfs_tp(cfs_rq);
 }
 
@@ -3984,6 +3986,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	cfs_rq_util_change(cfs_rq, 0);
 
+	numa_load_change(cfs_rq);
+
 	trace_pelt_cfs_tp(cfs_rq);
 }
 
@@ -4024,6 +4028,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	} else if (decayed) {
 		cfs_rq_util_change(cfs_rq, 0);
+		numa_load_change(cfs_rq);
 
 		if (flags & UPDATE_TG)
 			update_tg_load_avg(cfs_rq);
@@ -13286,6 +13291,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 	task_tick_core(rq, curr);
 
 	task_tick_relationship(rq, curr);
+
+	update_numa_capacity(rq);
 }
 
 /*
@@ -13868,6 +13875,7 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m)
 #ifdef CONFIG_SCHED_TASK_RELATIONSHIP
 	struct net_group *net_grp;
 	struct numa_group *ng;
+	int node;
 
 	if (!task_relationship_used())
 		return;
@@ -13889,6 +13897,10 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m)
 	}
 
 	rcu_read_unlock();
+
+	for_each_online_node(node) {
+		print_node_load_info(m, node);
+	}
 #endif
 }
 #endif /* CONFIG_SCHED_DEBUG */
@@ -13959,6 +13971,8 @@ __init void init_sched_fair_class(void)
 		INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i));
 #endif
 
+	init_sched_numa_icon();
+
 #ifdef CONFIG_SMP
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
diff --git a/kernel/sched/numa_icon.c b/kernel/sched/numa_icon.c
new file mode 100644
index 000000000000..e9825ac7f866
--- /dev/null
+++ b/kernel/sched/numa_icon.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for task numa isolation consolidation
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Hui Tang <tanghui20@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+#include "sched.h"
+
+static bool __sched_numa_icon_switch __initdata;
+DEFINE_STATIC_KEY_FALSE(sched_numa_icon_switch);
+
+struct node_load_info *node_load_ptr;
+
+static void set_numa_icon_switch(bool enabled)
+{
+	if (enabled) {
+		static_branch_enable(&sched_numa_icon_switch);
+		task_relationship_enable();
+	} else {
+		static_branch_disable(&sched_numa_icon_switch);
+		task_relationship_disable();
+	}
+}
+
+static int __init numa_icon_switch_setup(char *str)
+{
+	int ret = 0;
+
+	if (!str)
+		goto out;
+
+	/*
+	 * This code is called before jump labels have been set up, so we can't
+	 * change the static branch directly just yet. Instead set a temporary
+	 * variable so init_sched_numa_icon() can do it later.
+	 */
+	if (!strcmp(str, "enable")) {
+		__sched_numa_icon_switch = true;
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		__sched_numa_icon_switch = false;
+		ret = 1;
+	}
+out:
+	if (!ret)
+		pr_warn("Unable to parse numa_icon=\n");
+
+	return ret;
+}
+__setup("numa_icon=", numa_icon_switch_setup);
+
+__init void init_sched_numa_icon(void)
+{
+	int i;
+
+	set_numa_icon_switch(__sched_numa_icon_switch);
+
+	if (!sched_numa_icon_enabled())
+		return;
+
+	node_load_ptr = kcalloc(nr_node_ids, sizeof(struct node_load_info),
+				GFP_KERNEL);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		raw_spin_lock_init(&node_load_ptr[i].lock);
+		node_load_ptr[i].util_avg_last =
+			kcalloc(nr_cpu_ids, sizeof(struct sched_avg), GFP_KERNEL);
+	}
+
+	for_each_possible_cpu(i) {
+		node_load_ptr[cpu_to_node(i)].compute_capacity +=
+			SCHED_CAPACITY_SCALE;
+	}
+}
+
+void print_node_load_info(struct seq_file *m, int node)
+{
+	if (!sched_numa_icon_enabled())
+		return;
+
+	seq_printf(m, "node %d capacity=%lu util_avg=%lu\n", node,
+		   node_load_ptr[node].compute_capacity,
+		   atomic_long_read(&node_load_ptr[node].util_avg));
+}
+
+void numa_load_change(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	int cpu = cpu_of(rq);
+	int nid = cpu_to_node(cpu);
+	struct sched_avg *avg_old;
+	long delta;
+
+	if (!sched_numa_icon_enabled())
+		return;
+
+	avg_old = &node_load_ptr[nid].util_avg_last[cpu];
+
+	if (&rq->cfs != cfs_rq)
+		return;
+
+	delta = cfs_rq->avg.util_avg - avg_old->util_avg;
+	atomic_long_add(delta, &node_load_ptr[nid].util_avg);
+	avg_old->util_avg = cfs_rq->avg.util_avg;
+}
+
+void update_numa_capacity(struct rq *rq)
+{
+	int cpu = cpu_of(rq);
+	int nid = cpu_to_node(cpu);
+	unsigned long capacity = 0;
+
+	if (!sched_numa_icon_enabled())
+		return;
+
+	if (cpu != cpumask_first(cpumask_of_node(nid)))
+		return;
+
+	for_each_cpu(cpu, cpumask_of_node(nid)) {
+		capacity += cpu_rq(cpu)->cpu_capacity;
+	}
+	node_load_ptr[nid].compute_capacity = capacity;
+}
+
+#ifdef CONFIG_BPF_SCHED
+void sched_get_node_load(int nid, struct bpf_node_stats *ctx)
+{
+	ctx->util = atomic_long_read(&node_load_ptr[nid].util_avg);
+	ctx->compute_capacity = node_load_ptr[nid].compute_capacity;
+	ctx->weight = cpumask_weight(cpumask_of_node(nid));
+}
+#endif
diff --git a/kernel/sched/numa_icon.h b/kernel/sched/numa_icon.h
new file mode 100644
index 000000000000..adeed53e9f14
--- /dev/null
+++ b/kernel/sched/numa_icon.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_NUMA_ICON_H
+#include <linux/sched.h>
+
+struct node_load_info {
+	raw_spinlock_t		lock ____cacheline_aligned;
+	atomic_long_t		util_avg;
+	unsigned long		compute_capacity;
+	struct sched_avg	*util_avg_last;
+};
+
+#ifdef CONFIG_QOS_SCHED_NUMA_ICON
+extern struct static_key_false sched_numa_icon_switch;
+static __always_inline bool sched_numa_icon_enabled(void)
+{
+	return static_branch_unlikely(&sched_numa_icon_switch);
+}
+
+extern void print_node_load_info(struct seq_file *m, int node);
+extern __init void init_sched_numa_icon(void);
+extern void sched_get_node_load(int nid, struct bpf_node_stats *ctx);
+extern void init_node_load(struct rq *rq);
+extern void numa_load_change(struct cfs_rq *cfs_rq);
+extern void update_numa_capacity(struct rq *rq);
+
+#else /* !CONFIG_QOS_SCHED_NUMA_ICON */
+static inline void init_sched_numa_icon(void) {}
+
+static inline void init_node_load(struct rq *rq) {}
+
+static inline void numa_load_change(struct cfs_rq *cfs_rq) {}
+
+static inline void update_numa_capacity(struct rq *rq) {}
+
+static inline void print_node_load_info(struct seq_file *m, int node) {}
+
+static __always_inline bool sched_numa_icon_enabled(void)
+{
+	return false;
+}
+#endif /* CONFIG_QOS_SCHED_NUMA_ICON */
+
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6f934af7062..3b2fc472908a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -76,6 +76,8 @@
 
 #include "cpupri.h"
 #include "cpudeadline.h"
+#include "numa_icon.h"
+#include <uapi/linux/sched_ctrl.h>
 
 #include <trace/events/sched.h>
 
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 4f6fac621f65..3afc3e354844 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -449,6 +449,7 @@ class PrinterHelpers(Printer):
             'struct bpf_relationship_get_args',
             'struct bpf_relationship_set_args',
             'struct sched_preferred_node_ctx',
+            'struct bpf_node_stats',
     ]
     known_types = {
             '...',
@@ -506,6 +507,7 @@ class PrinterHelpers(Printer):
             'struct bpf_relationship_get_args',
             'struct bpf_relationship_set_args',
             'struct sched_preferred_node_ctx',
+            'struct bpf_node_stats',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5c04747f201c..5a153a1a8f18 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3914,6 +3914,12 @@ union bpf_attr {
  *		set current task preferred node.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_node_stats(int nid, struct bpf_node_stats *ctx, int len)
+ *	Description
+ *		get resource statistics of *nid* and store in *ctx*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4089,6 +4095,7 @@ union bpf_attr {
 	FN(nodemask_op),		\
 	FN(get_task_relationship_stats),\
 	FN(sched_set_curr_preferred_node),\
+	FN(get_node_stats),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h
index 04af73b92856..3e9b41788637 100644
--- a/tools/lib/bpf/libbpf_sched.h
+++ b/tools/lib/bpf/libbpf_sched.h
@@ -623,6 +623,30 @@ static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se)
 	return se_tag;
 }
 
+static __always_inline unsigned long libbpf_node_cfs_util_of(int nid)
+{
+	struct bpf_node_stats stats = {0};
+
+	bpf_get_node_stats(nid, &stats, sizeof(stats));
+	return getVal(stats.util);
+}
+
+static __always_inline unsigned long libbpf_node_cfs_capacity_of(int nid)
+{
+	struct bpf_node_stats stats = {0};
+
+	bpf_get_node_stats(nid, &stats, sizeof(stats));
+	return getVal(stats.compute_capacity);
+}
+
+static __always_inline unsigned int libbpf_node_weight_of(int nid)
+{
+	struct bpf_node_stats stats = {0};
+
+	bpf_get_node_stats(nid, &stats, sizeof(stats));
+	return getVal(stats.weight);
+}
+
 static __always_inline int libbpf_mem_preferred_nid(struct task_struct *tsk,
 						    nodemask_t *preferred_node)
 {