add ebpf sched
Cheng Yu (1):
  sched/ebpf: Support task selection programmable

Hui Tang (1):
  sched: Add kfunc to get cpu statistics

Roman Gushchin (2):
  libbpf: add support for scheduler bpf programs
  bpftool: recognize scheduler programs

 include/linux/sched.h           |  20 ++++
 include/linux/sched_hook_defs.h |   4 +
 kernel/bpf/btf.c                |   3 +
 kernel/sched/bpf_sched.c        | 190 ++++++++++++++++++++++++++++++++
 kernel/sched/fair.c             |  14 +++
 tools/lib/bpf/libbpf.c          |  23 +++-
 tools/lib/bpf/libbpf.h          |   2 +
 tools/lib/bpf/libbpf.map        |   1 +
 8 files changed, 256 insertions(+), 1 deletion(-)
From: Roman Gushchin <guro@fb.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
This patch adds support for loading and attaching scheduler bpf programs.
Fixes: 82c25c3e9414 ("sched: basic infrastructure for scheduler bpf")
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 tools/lib/bpf/libbpf.c   | 21 ++++++++++++++++++++-
 tools/lib/bpf/libbpf.h   |  2 ++
 tools/lib/bpf/libbpf.map |  1 +
 3 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ceed16a10285..6cfc963f2322 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3030,7 +3030,8 @@ static int bpf_object_fixup_btf(struct bpf_object *obj)
 static bool prog_needs_vmlinux_btf(struct bpf_program *prog)
 {
 	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
-	    prog->type == BPF_PROG_TYPE_LSM)
+	    prog->type == BPF_PROG_TYPE_LSM ||
+	    prog->type == BPF_PROG_TYPE_SCHED)
 		return true;
 
 	/* BPF_PROG_TYPE_TRACING programs which do not attach to other programs
@@ -8770,6 +8771,7 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru
 static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
+static int attach_sched(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 
 static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("socket",		SOCKET_FILTER, 0, SEC_NONE),
@@ -8864,6 +8866,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
 	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
 	SEC_DEF("netfilter",		NETFILTER, BPF_NETFILTER, SEC_NONE),
+	SEC_DEF("sched/",		SCHED, BPF_SCHED, SEC_ATTACH_BTF, attach_sched),
 };
 
 int libbpf_register_prog_handler(const char *sec,
@@ -9243,6 +9246,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
 #define BTF_TRACE_PREFIX "btf_trace_"
 #define BTF_LSM_PREFIX "bpf_lsm_"
 #define BTF_ITER_PREFIX "bpf_iter_"
+#define BTF_SCHED_PREFIX "bpf_sched_"
 #define BTF_MAX_NAME_SIZE 128
 
 void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
@@ -9262,6 +9266,10 @@ void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
 		*prefix = BTF_ITER_PREFIX;
 		*kind = BTF_KIND_FUNC;
 		break;
+	case BPF_SCHED:
+		*prefix = BTF_SCHED_PREFIX;
+		*kind = BTF_KIND_FUNC;
+		break;
 	default:
 		*prefix = "";
 		*kind = BTF_KIND_FUNC;
@@ -12119,6 +12127,17 @@ struct bpf_link *bpf_program__attach_netfilter(const struct bpf_program *prog,
 	return link;
 }
 
+struct bpf_link *bpf_program__attach_sched(const struct bpf_program *prog)
+{
+	return bpf_program__attach_btf_id(prog, NULL);
+}
+
+static int attach_sched(const struct bpf_program *prog, long cookie, struct bpf_link **link)
+{
+	*link = bpf_program__attach_sched(prog);
+	return libbpf_get_error(*link);
+}
+
 struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
 {
 	struct bpf_link *link = NULL;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 0e52621cba43..aabdd973c1a5 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -769,6 +769,8 @@ bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex);
 LIBBPF_API struct bpf_link *
 bpf_program__attach_freplace(const struct bpf_program *prog, int target_fd,
			     const char *attach_func_name);
+LIBBPF_API struct bpf_link *
+bpf_program__attach_sched(const struct bpf_program *prog);
 
 struct bpf_netfilter_opts {
 	/* size of this struct, for forward/backward compatibility */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 57712321490f..228ab00a5e69 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -236,6 +236,7 @@ LIBBPF_0.2.0 {
 		perf_buffer__buffer_fd;
 		perf_buffer__epoll_fd;
 		perf_buffer__consume_buffer;
+		bpf_program__attach_sched;
 } LIBBPF_0.1.0;
 
 LIBBPF_0.3.0 {
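For illustration, a minimal userspace sketch of how the new attach API is meant to be used. The object file name "sched_policy.bpf.o" and the program name "sched_prog" are hypothetical, and error handling is abbreviated:

/* Hypothetical loader: opens a BPF object containing a SEC("sched/...")
 * program and attaches it with the bpf_program__attach_sched() API added
 * above. Object and program names are examples only.
 */
#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;

	obj = bpf_object__open_file("sched_policy.bpf.o", NULL);
	if (!obj)
		return 1;

	/* Loading requires vmlinux BTF, cf. prog_needs_vmlinux_btf() above. */
	if (bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_name(obj, "sched_prog");
	if (!prog)
		return 1;

	/* Explicit attach; bpf_program__attach(prog) also works because the
	 * "sched/" section definition registers attach_sched(). */
	link = bpf_program__attach_sched(prog);
	if (!link)
		return 1;

	pause();	/* keep the link, and therefore the program, alive */
	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}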
From: Roman Gushchin <guro@fb.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
Teach bpftool to recognize scheduler bpf programs.
Fixes: 82c25c3e9414 ("sched: basic infrastructure for scheduler bpf")
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Guan Jing <guanjing6@huawei.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 tools/lib/bpf/libbpf.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6cfc963f2322..4364c0c9b100 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -122,6 +122,7 @@ static const char * const attach_type_name[] = {
 	[BPF_TCX_INGRESS]		= "tcx_ingress",
 	[BPF_TCX_EGRESS]		= "tcx_egress",
 	[BPF_TRACE_UPROBE_MULTI]	= "trace_uprobe_multi",
+	[BPF_SCHED]			= "sched",
 };
 
 static const char * const link_type_name[] = {
@@ -210,6 +211,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_LOOKUP]		= "sk_lookup",
 	[BPF_PROG_TYPE_SYSCALL]			= "syscall",
 	[BPF_PROG_TYPE_NETFILTER]		= "netfilter",
+	[BPF_PROG_TYPE_SCHED]			= "sched",
 };
 
 static int __base_pr(enum libbpf_print_level level, const char *format,
From: Hui Tang <tanghui20@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8OIT1
CVE: NA
--------------------------------
Support kfunc for BPF_PROG_TYPE_SCHED.
And add helper functions to get CPU statistics, as follows:
1. acquire cfs/rt/irq cpu load statistics.
2. acquire multiple types of nr_running statistics.
3. acquire cpu idle statistics.
4. acquire cpu capacity.

Based on CPU statistics in different dimensions, specific scheduling
policies can be implemented in BPF programs.
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h    |  20 ++++++
 kernel/bpf/btf.c         |   3 +
 kernel/sched/bpf_sched.c | 140 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 163 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b8be76b0c120..d65ec3179225 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2557,6 +2557,26 @@ struct sched_migrate_node {
 	int dst_cpu;
 	int dst_node;
 };
+
+struct bpf_sched_cpu_stats {
+	refcount_t usage;
+	unsigned int nr_running;
+	unsigned int cfs_nr_running;
+	unsigned int cfs_h_nr_running;
+	unsigned int cfs_idle_h_nr_running;
+	unsigned int rt_nr_running;
+	unsigned int rr_nr_running;
+
+	KABI_RESERVE(1)
+	KABI_RESERVE(2)
+	KABI_RESERVE(3)
+	KABI_RESERVE(4)
+	KABI_RESERVE(5)
+	KABI_RESERVE(6)
+	KABI_RESERVE(7)
+	KABI_RESERVE(8)
+};
+
 #endif
 
 #ifdef CONFIG_SCHED_CORE
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e197476f3c28..7a82194ad51d 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -217,6 +217,7 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_SOCKET_FILTER,
 	BTF_KFUNC_HOOK_LWT,
 	BTF_KFUNC_HOOK_NETFILTER,
+	BTF_KFUNC_HOOK_SCHED,
 	BTF_KFUNC_HOOK_MAX,
 };
@@ -7864,6 +7865,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 		return BTF_KFUNC_HOOK_LWT;
 	case BPF_PROG_TYPE_NETFILTER:
 		return BTF_KFUNC_HOOK_NETFILTER;
+	case BPF_PROG_TYPE_SCHED:
+		return BTF_KFUNC_HOOK_SCHED;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
index e2525bd60abf..7eb3be52110f 100644
--- a/kernel/sched/bpf_sched.c
+++ b/kernel/sched/bpf_sched.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
 #include <linux/cgroup.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bpf_sched.h>
@@ -62,3 +63,142 @@ const struct bpf_verifier_ops bpf_sched_verifier_ops = {
 	.get_func_proto = bpf_sched_func_proto,
 	.is_valid_access = btf_ctx_access,
 };
+
+static struct bpf_mem_alloc bpf_cpustats_ma;
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in BTF");
+
+/**
+ * bpf_sched_cpustats_create() - Create a mutable BPF cpustats context.
+ *
+ * Allocates a cpustats context that can be queried, mutated, acquired, and
+ * released by a BPF program. The cpustats context returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_sched_cpustats_release().
+ *
+ * bpf_sched_cpustats_create() allocates memory using the BPF memory
+ * allocator, and will not block. It may return NULL if no memory is
+ * available.
+ */
+__bpf_kfunc struct bpf_sched_cpu_stats *bpf_sched_cpustats_create(void)
+{
+	struct bpf_sched_cpu_stats *cpustats;
+
+	cpustats = bpf_mem_cache_alloc(&bpf_cpustats_ma);
+	if (!cpustats)
+		return NULL;
+
+	memset(cpustats, 0, sizeof(*cpustats));
+	refcount_set(&cpustats->usage, 1);
+
+	return cpustats;
+}
+
+/**
+ * bpf_sched_cpustats_acquire() - Acquire a reference to a BPF cpustats.
+ * @cpustats: The BPF cpustats being acquired. The cpustats must be a
+ *	      trusted pointer.
+ *
+ * Acquires a reference to a BPF cpustats. The cpustats returned by this
+ * function must either be embedded in a map as a kptr, or freed with
+ * bpf_sched_cpustats_release().
+ */
+__bpf_kfunc struct bpf_sched_cpu_stats *bpf_sched_cpustats_acquire(
+					struct bpf_sched_cpu_stats *cpustats)
+{
+	refcount_inc(&cpustats->usage);
+	return cpustats;
+}
+
+/**
+ * bpf_sched_cpustats_release() - Release a previously acquired BPF cpustats.
+ * @cpustats: The cpustats being released.
+ *
+ * Releases a previously acquired reference to a BPF cpustats. When the final
+ * reference of the BPF cpustats has been released, it is subsequently freed
+ * in an RCU callback in the BPF memory allocator.
+ */
+__bpf_kfunc void
+bpf_sched_cpustats_release(struct bpf_sched_cpu_stats *cpustats)
+{
+	if (!refcount_dec_and_test(&cpustats->usage))
+		return;
+
+	migrate_disable();
+	bpf_mem_cache_free_rcu(&bpf_cpustats_ma, cpustats);
+	migrate_enable();
+}
+
+/**
+ * bpf_sched_cpu_stats_of() - Acquire cpu sched statistics.
+ * @cpuid: CPU ID, input.
+ * @ctx: Where the cpu statistics are stored.
+ *
+ * Return:
+ * 0 - Success.
+ * <0 - Failure.
+ */
+__bpf_kfunc s32 bpf_sched_cpu_stats_of(int cpuid,
+				       struct bpf_sched_cpu_stats *ctx)
+{
+	struct rq *rq;
+	int cpu = cpuid;
+
+	if (!ctx)
+		return -EINVAL;
+
+	if ((unsigned int)cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	rq = cpu_rq(cpu);
+	memset(ctx, 0, sizeof(*ctx));
+
+	SCHED_WARN_ON(!rcu_read_lock_held());
+	/* nr_running */
+	ctx->nr_running = rq->nr_running;
+	ctx->cfs_nr_running = rq->cfs.nr_running;
+	ctx->cfs_h_nr_running = rq->cfs.h_nr_running;
+	ctx->cfs_idle_h_nr_running = rq->cfs.idle_h_nr_running;
+	ctx->rt_nr_running = rq->rt.rt_nr_running;
+	ctx->rr_nr_running = rq->rt.rr_nr_running;
+
+	return 0;
+}
+
+__diag_pop();
+
+BTF_SET8_START(sched_cpustats_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_sched_cpustats_create, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_sched_cpustats_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_sched_cpustats_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_sched_cpu_stats_of, KF_RCU)
+BTF_SET8_END(sched_cpustats_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set cpustats_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &sched_cpustats_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(cpustats_dtor_ids)
+BTF_ID(struct, bpf_sched_cpu_stats)
+BTF_ID(func, bpf_sched_cpustats_release)
+
+static int __init bpf_kfunc_init(void)
+{
+	int ret;
+	const struct btf_id_dtor_kfunc cpustats_dtors[] = {
+		{
+			.btf_id	      = cpustats_dtor_ids[0],
+			.kfunc_btf_id = cpustats_dtor_ids[1]
+		},
+	};
+
+	ret = bpf_mem_alloc_init(&bpf_cpustats_ma, sizeof(struct bpf_sched_cpu_stats), false);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpustats_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED, &cpustats_kfunc_set);
+	return ret ?: register_btf_id_dtor_kfuncs(cpustats_dtors,
+						  ARRAY_SIZE(cpustats_dtors),
+						  THIS_MODULE);
+}
+late_initcall(bpf_kfunc_init);
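For illustration, a BPF-side sketch of the intended kfunc lifecycle (create, query, release). The hook name cfs_select_rq comes from the existing sched_hook_defs.h entries, the kfunc prototypes would normally be taken from vmlinux BTF, and the policy and return convention shown here are only examples:

/* Example-only BPF_PROG_TYPE_SCHED program: allocates a cpustats object,
 * reads the run-queue counters of CPU 0, and releases the object again.
 * Returning a negative value is assumed to mean "no opinion, use the
 * default policy", matching the -1 defaults in sched_hook_defs.h.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct bpf_sched_cpu_stats *bpf_sched_cpustats_create(void) __ksym;
void bpf_sched_cpustats_release(struct bpf_sched_cpu_stats *cpustats) __ksym;
s32 bpf_sched_cpu_stats_of(int cpuid, struct bpf_sched_cpu_stats *ctx) __ksym;

SEC("sched/cfs_select_rq")
int BPF_PROG(prefer_idle_cpu0, struct sched_migrate_ctx *ctx)
{
	struct bpf_sched_cpu_stats *stats;
	int target = -1;

	stats = bpf_sched_cpustats_create();
	if (!stats)
		return -1;

	/* Pick CPU 0 only when it currently has nothing runnable. */
	if (!bpf_sched_cpu_stats_of(0, stats) && stats->nr_running == 0)
		target = 0;

	bpf_sched_cpustats_release(stats);
	return target;
}

char _license[] SEC("license") = "GPL";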
From: Cheng Yu <serein.chengyu@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IB6NEH
CVE: NA
--------------------------------
Add hooks on the pick_next_task_fair() path so that users can customize the
task selection strategy, and provide some useful kfuncs.
Signed-off-by: Cheng Yu <serein.chengyu@huawei.com>
---
 include/linux/sched_hook_defs.h |  4 +++
 kernel/sched/bpf_sched.c        | 56 +++++++++++++++++++++++++++++++--
 kernel/sched/fair.c             | 14 +++++++
 3 files changed, 71 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
index c43297cc6049..57b7f61d515c 100644
--- a/include/linux/sched_hook_defs.h
+++ b/include/linux/sched_hook_defs.h
@@ -2,3 +2,7 @@ BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx)
 BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p,
	       struct sched_migrate_node *migrate_node)
+BPF_SCHED_HOOK(int, -1, cfs_tag_entity_eligible, struct sched_entity *se)
+BPF_SCHED_HOOK(int, -1, cfs_tag_pick_next_entity,
+	       const struct sched_entity *curr,
+	       const struct sched_entity *next)
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
index 7eb3be52110f..2958b3029d38 100644
--- a/kernel/sched/bpf_sched.c
+++ b/kernel/sched/bpf_sched.c
@@ -184,6 +184,46 @@ BTF_ID_LIST(cpustats_dtor_ids)
 BTF_ID(struct, bpf_sched_cpu_stats)
 BTF_ID(func, bpf_sched_cpustats_release)
 
+__bpf_kfunc int bpf_sched_entity_is_task(struct sched_entity *se)
+{
+	if (!se)
+		return -EINVAL;
+
+	return entity_is_task(se);
+}
+
+__bpf_kfunc struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se)
+{
+	if (se && entity_is_task(se))
+		return task_of(se);
+
+	return NULL;
+}
+
+__bpf_kfunc long bpf_sched_tag_of_entity(struct sched_entity *se)
+{
+	if (!se)
+		return -EINVAL;
+
+	if (entity_is_task(se))
+		return task_of(se)->tag;
+
+	return group_cfs_rq(se)->tg->tag;
+}
+
+BTF_SET8_START(sched_entity_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_sched_entity_is_task)
+BTF_ID_FLAGS(func, bpf_sched_entity_to_task)
+BTF_ID_FLAGS(func, bpf_sched_tag_of_entity)
+BTF_SET8_END(sched_entity_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set sched_entity_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &sched_entity_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(sched_entity_dtor_ids)
+
 static int __init bpf_kfunc_init(void)
 {
 	int ret;
@@ -193,12 +233,22 @@ static int __init bpf_kfunc_init(void)
 			.kfunc_btf_id = cpustats_dtor_ids[1]
 		},
 	};
+	const struct btf_id_dtor_kfunc sched_entity_dtors[] = {
+		{
+			.btf_id	      = sched_entity_dtor_ids[0],
+			.kfunc_btf_id = sched_entity_dtor_ids[1]
+		},
+	};
 	ret = bpf_mem_alloc_init(&bpf_cpustats_ma, sizeof(struct bpf_sched_cpu_stats), false);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpustats_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED, &cpustats_kfunc_set);
-	return ret ?: register_btf_id_dtor_kfuncs(cpustats_dtors,
-						  ARRAY_SIZE(cpustats_dtors),
-						  THIS_MODULE);
+	ret = ret ?: register_btf_id_dtor_kfuncs(cpustats_dtors,
+						 ARRAY_SIZE(cpustats_dtors),
+						 THIS_MODULE);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED, &sched_entity_kfunc_set);
+	return ret ?: register_btf_id_dtor_kfuncs(sched_entity_dtors,
+						  ARRAY_SIZE(sched_entity_dtors),
+						  THIS_MODULE);
 }
 late_initcall(bpf_kfunc_init);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f1cd57e70f1f..2ef32e806f54 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -708,6 +708,13 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 static inline bool entity_before(const struct sched_entity *a,
				 const struct sched_entity *b)
 {
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		if (bpf_sched_cfs_tag_pick_next_entity(a, b) == 1)
+			return true;
+	}
+#endif
+
 	/*
	 * Tiebreak on vruntime seems unnecessary since it can
	 * hardly happen.
@@ -905,6 +912,13 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 
 int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+#ifdef CONFIG_BPF_SCHED
+	if (bpf_sched_enabled()) {
+		if (bpf_sched_cfs_tag_entity_eligible(se) == 1)
+			return 1;
+	}
+#endif
+
 	return vruntime_eligible(cfs_rq, se->vruntime);
 }
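For illustration, a BPF-side sketch of programs for the two new hooks, using the entity kfuncs added above. The tag-based policy is purely an example; returning anything other than 1 leaves the default CFS behaviour in place, matching the kernel-side checks in fair.c:

/* Example-only programs for the cfs_tag_* hooks. The policy (prefer
 * positively tagged entities) is illustrative, not part of this patch.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

long bpf_sched_tag_of_entity(struct sched_entity *se) __ksym;

SEC("sched/cfs_tag_entity_eligible")
int BPF_PROG(tag_entity_eligible, struct sched_entity *se)
{
	/* 1: report the entity as eligible; any other value falls back to
	 * vruntime_eligible() in entity_eligible(). */
	return bpf_sched_tag_of_entity(se) > 0 ? 1 : 0;
}

SEC("sched/cfs_tag_pick_next_entity")
int BPF_PROG(tag_pick_next_entity, struct sched_entity *curr,
	     struct sched_entity *next)
{
	/* 1: order 'curr' before 'next'; any other value keeps the default
	 * entity_before() vruntime comparison. */
	if (bpf_sched_tag_of_entity(curr) > 0 &&
	    bpf_sched_tag_of_entity(next) <= 0)
		return 1;
	return 0;
}

char _license[] SEC("license") = "GPL";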
FeedBack:
The patch(es) you sent to the kernel@openeuler.org mailing list have been successfully converted to a pull request!
Pull request link: https://gitee.com/openeuler/kernel/pulls/13805
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/4...