From: Roman Gushchin <guro@fb.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
This commit introduces basic definitions and infrastructure for scheduler bpf programs. It defines the BPF_PROG_TYPE_SCHED program type and the BPF_SCHED attachment type.
The implementation is inspired by lsm bpf programs and is based on kretprobes. This makes it possible to add new hooks with minimal changes to the kernel code and without any changes to libbpf/bpftool. It's very convenient, as I anticipate a large number of private patches being used for a long time before (or if at all) reaching upstream.

Sched programs are expected to return an int, whose meaning is defined by the calling context.

This patch doesn't add any real scheduler hooks (only a stub); those will be added by the following patches in the series.

For now, scheduler bpf programs are very restricted in what they can do: only the bpf_printk() helper is available. The scheduler context can impose significant restrictions on what's safe and what's not, so let's extend their abilities on a case-by-case basis as the need arises.
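To make the intended usage concrete, here is a minimal sketch of a program of the new type written against the dummy stub added by this patch. It assumes the "sched/" section handling and attach support that a later patch in this series adds to libbpf, and it only uses bpf_printk(), in line with the restrictions above:

  /* minimal_sched.bpf.c -- illustrative sketch only */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  /* sched programs must carry a GPL-compatible license */
  char LICENSE[] SEC("license") = "GPL";

  /* attaches to the bpf_sched_dummy() stub declared in sched_hook_defs.h */
  SEC("sched/dummy")
  int BPF_PROG(dummy)
  {
          bpf_printk("sched bpf program invoked\n");
          return 0;       /* the meaning of the return value is hook-specific */
  }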
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
---
 include/linux/bpf_sched.h       | 26 ++++++++++++++
 include/linux/bpf_types.h       |  4 +++
 include/linux/sched_hook_defs.h |  2 ++
 include/uapi/linux/bpf.h        |  2 ++
 init/Kconfig                    | 10 ++++++
 kernel/bpf/btf.c                |  1 +
 kernel/bpf/syscall.c            |  9 +++++
 kernel/bpf/trampoline.c         |  1 +
 kernel/bpf/verifier.c           | 11 +++++-
 kernel/sched/Makefile           |  1 +
 kernel/sched/bpf_sched.c        | 62 +++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h  |  2 ++
 12 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/bpf_sched.h
 create mode 100644 include/linux/sched_hook_defs.h
 create mode 100644 kernel/sched/bpf_sched.c
diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h new file mode 100644 index 000000000000..874393e6a6aa --- /dev/null +++ b/include/linux/bpf_sched.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BPF_SCHED_H +#define _LINUX_BPF_SCHED_H + +#include <linux/bpf.h> + +#ifdef CONFIG_BPF_SCHED + +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \ + RET bpf_sched_##NAME(__VA_ARGS__); +#include <linux/sched_hook_defs.h> +#undef BPF_SCHED_HOOK + +int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog); + +#else /* !CONFIG_BPF_SCHED */ + +static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_BPF_SCHED */ +#endif /* _LINUX_BPF_SCHED_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a8137bb6dd3c..5732b485c539 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -77,6 +77,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, void *, void *) #endif /* CONFIG_BPF_LSM */ #endif +#ifdef CONFIG_BPF_SCHED +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED, bpf_sched, + void *, void *) +#endif /* CONFIG_BPF_SCHED */
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h new file mode 100644 index 000000000000..14344004e335 --- /dev/null +++ b/include/linux/sched_hook_defs.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +BPF_SCHED_HOOK(int, 0, dummy, void) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8fae845d80e2..4f8f3f2113a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -199,6 +199,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SCHED, };
enum bpf_attach_type { @@ -240,6 +241,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SCHED, __MAX_BPF_ATTACH_TYPE };
diff --git a/init/Kconfig b/init/Kconfig index 27c5ed16fef1..c3fa3240d87d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1749,6 +1749,16 @@ config BPF_LSM
If you are unsure how to answer this question, answer N.
+config BPF_SCHED + bool "SCHED Instrumentation with BPF" + depends on BPF_EVENTS + depends on BPF_SYSCALL + help + Enables instrumentation of the sched hooks with eBPF programs for + implementing dynamic scheduling Policies. + + If you are unsure how to answer this question, answer N. + config BPF_SYSCALL bool "Enable bpf() system call" select BPF diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index fba28f17e61a..9a0a9895ec62 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4479,6 +4479,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; t = btf_type_by_id(btf, t->type); break; + case BPF_SCHED: case BPF_MODIFY_RETURN: /* For now the BPF_MODIFY_RETURN can only be attached to * functions that return an int. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 419dbc3d060e..ff65862ae5ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1997,6 +1997,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_SCHED: break; default: return -EINVAL; @@ -2108,6 +2109,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ case BPF_PROG_TYPE_EXT: /* extends any prog */ + case BPF_PROG_TYPE_SCHED: return true; default: return false; @@ -2608,6 +2610,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } break; + case BPF_PROG_TYPE_SCHED: + if (prog->expected_attach_type != BPF_SCHED) { + err = -EINVAL; + goto out_put_prog; + } + break; default: err = -EINVAL; goto out_put_prog; @@ -2838,6 +2846,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_SCHED: if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 986dabc3d11f..cc6ba35a1d14 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -357,6 +357,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_SCHED: case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6423f1714a2f..d26104b258ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22,6 +22,7 @@ #include <linux/error-injection.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> +#include <linux/bpf_sched.h>
#include "disasm.h"
@@ -12178,6 +12179,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_SCHED: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -12283,7 +12285,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM && - prog->type != BPF_PROG_TYPE_EXT) + prog->type != BPF_PROG_TYPE_EXT && + prog->type != BPF_PROG_TYPE_SCHED) return 0;
ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info); @@ -12323,6 +12326,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; }
+ if (prog->type == BPF_PROG_TYPE_SCHED) { + ret = bpf_sched_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + key = bpf_trampoline_compute_key(tgt_prog, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862..8ae9e39eb83a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_BPF_SCHED) += bpf_sched.o \ No newline at end of file diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c new file mode 100644 index 000000000000..2f05c186cfd0 --- /dev/null +++ b/kernel/sched/bpf_sched.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <linux/cgroup.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf_sched.h> +#include <linux/btf_ids.h> +#include "sched.h" + +/* + * For every hook declare a nop function where a BPF program can be attached. + */ +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \ +noinline RET bpf_sched_##NAME(__VA_ARGS__) \ +{ \ + return DEFAULT; \ +} + +#include <linux/sched_hook_defs.h> +#undef BPF_SCHED_HOOK + +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_sched_##NAME) +BTF_SET_START(bpf_sched_hooks) +#include <linux/sched_hook_defs.h> +#undef BPF_SCHED_HOOK +BTF_SET_END(bpf_sched_hooks) + +int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + if (!prog->gpl_compatible) { + bpf_log(vlog, + "sched programs must have a GPL compatible license\n"); + return -EINVAL; + } + + if (!btf_id_set_contains(&bpf_sched_hooks, prog->aux->attach_btf_id)) { + bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto * +bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + default: + return NULL; + } +} + +const struct bpf_prog_ops bpf_sched_prog_ops = { +}; + +const struct bpf_verifier_ops bpf_sched_verifier_ops = { + .get_func_proto = bpf_sched_func_proto, + .is_valid_access = btf_ctx_access, +}; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a44cb5155825..7d7c04f698ec 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -199,6 +199,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SCHED, };
enum bpf_attach_type { @@ -240,6 +241,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SCHED, __MAX_BPF_ATTACH_TYPE };
From: Chen Hui <judy.chenhui@huawei.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
Move the definitions of four functions (task_of(), task_cfs_rq(), cfs_rq_of() and group_cfs_rq()) from fair.c to sched.h, consistent with the latest version, so that they can also be used outside of fair.c (bpf_sched.c needs them in a later patch of this series).
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
---
 kernel/sched/fair.c  | 46 ---------------------------------------
 kernel/sched/sched.h | 52 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 46 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50d457979db6..bbb4730d238d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -278,33 +278,11 @@ const struct sched_class fair_sched_class; */
#ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -}
/* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (!path) @@ -465,33 +443,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (path) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e41a5207a212..35cc92b5eeea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1172,6 +1172,58 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues)
+#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + + extern void update_rq_clock(struct rq *rq);
static inline u64 __rq_clock_broken(struct rq *rq)
From: Roman Gushchin <guro@fb.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
This patch adds 3 helpers useful for dealing with sched entities:

  u64 bpf_sched_entity_to_tgidpid(struct sched_entity *se);
  u64 bpf_sched_entity_to_cgrpid(struct sched_entity *se);
  long bpf_sched_entity_belongs_to_cgrp(struct sched_entity *se, u64 cgrpid);
A sched entity is the basic structure used by the scheduler to represent schedulable objects: tasks and cgroups (the latter only if CONFIG_FAIR_GROUP_SCHED is enabled). It will be passed as an argument to many bpf hooks, so scheduler bpf programs need a convenient way to deal with it.
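For reference, the scheduler itself tells the two kinds of entities apart via entity_is_task() (defined in kernel/sched/fair.c of this kernel): a group entity "owns" a runqueue, a task entity does not. The helpers below rely on the same distinction:

  #ifdef CONFIG_FAIR_GROUP_SCHED
  /* An entity is a task if it doesn't "own" a runqueue */
  #define entity_is_task(se)      (!se->my_q)
  #else
  #define entity_is_task(se)      1
  #endif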
bpf_sched_entity_to_tgidpid() and bpf_sched_entity_to_cgrpid() are useful for identifying a sched entity in userspace terms (pid, tgid and cgroup id). bpf_sched_entity_belongs_to_cgrp() checks whether a sched entity belongs to the sub-tree of a cgroup. It makes it possible to write cgroup-specific scheduler policies even without enabling the cgroup cpu controller.
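As a sketch of how the helpers could be combined, the program below assumes a hook that receives struct sched_entity pointers (such as cfs_wakeup_preempt_entity, added later in this series) and a libbpf build whose bpf_helper_defs.h was regenerated from the patched uapi header; the cgroup id is a placeholder filled in by the loader:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  const volatile __u64 target_cgrp_id;  /* set by userspace before load */

  SEC("sched/cfs_wakeup_preempt_entity")
  int BPF_PROG(wakeup_preempt, struct sched_entity *curr, struct sched_entity *se)
  {
          /* tgid in the upper 32 bits, pid in the lower ones; (u64)-1 for group entities */
          __u64 tgidpid = bpf_sched_entity_to_tgidpid(se);

          /* don't let anything preempt entities running in the target sub-tree */
          if (bpf_sched_entity_belongs_to_cgrp(curr, target_cgrp_id))
                  return -1;

          bpf_printk("wakee tgid=%u pid=%u\n",
                     (__u32)(tgidpid >> 32), (__u32)tgidpid);
          return 0;       /* 0 keeps the default wakeup_preempt_entity() logic */
  }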
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
---
 include/uapi/linux/bpf.h       | 23 ++++++++++
 kernel/sched/bpf_sched.c       | 76 +++++++++++++++++++++++++++++++++-
 scripts/bpf_helpers_doc.py     |  2 +
 tools/include/uapi/linux/bpf.h | 23 ++++++++++
 4 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f8f3f2113a7..499bb05b01fd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3757,6 +3757,26 @@ union bpf_attr { * Get Ipv4 origdst or replysrc. Works with IPv4. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_sched_entity_to_tgidpid(struct sched_entity *se) + * Description + * Return task's encoded tgid and pid if the sched entity is a task. + * Return + * Tgid and pid encoded as tgid << 32 | pid, if *se* is a task. (u64)-1 otherwise. + * + * u64 bpf_sched_entity_to_cgrpid(struct sched_entity *se) + * Description + * Return cgroup id if the given sched entity is a cgroup. + * Return + * Cgroup id, if *se* is a cgroup. (u64)-1 otherwise. + * + * long bpf_sched_entity_belongs_to_cgrp(struct sched_entity *se, u64 cgrpid) + * Description + * Checks whether the sched entity belongs to a cgroup or + * it's sub-tree. It doesn't require a cgroup CPU controller + * to be enabled. + * Return + * Cgroup id, if *se* is a cgroup. (u64)-1 otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3917,6 +3937,9 @@ union bpf_attr { FN(redirect_peer), \ FN(get_sockops_uid_gid), \ FN(sk_original_addr), \ + FN(sched_entity_to_tgidpid), \ + FN(sched_entity_to_cgrpid), \ + FN(sched_entity_belongs_to_cgrp), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 2f05c186cfd0..831b5917fcda 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -42,14 +42,88 @@ int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, return 0; }
+BPF_CALL_1(bpf_sched_entity_to_tgidpid, struct sched_entity *, se) +{ + if (entity_is_task(se)) { + struct task_struct *task = task_of(se); + + return (u64) task->tgid << 32 | task->pid; + } else { + return (u64) -1; + } +} + +BPF_CALL_1(bpf_sched_entity_to_cgrpid, struct sched_entity *, se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) + return cgroup_id(se->my_q->tg->css.cgroup); +#endif + return (u64) -1; +} + +BPF_CALL_2(bpf_sched_entity_belongs_to_cgrp, struct sched_entity *, se, + u64, cgrpid) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; + int level; + + if (entity_is_task(se)) + cgrp = task_dfl_cgroup(task_of(se)); +#ifdef CONFIG_FAIR_GROUP_SCHED + else + cgrp = se->my_q->tg->css.cgroup; +#endif + + for (level = cgrp->level; level; level--) + if (cgrp->ancestor_ids[level] == cgrpid) + return 1; +#endif + return 0; +} + +BTF_ID_LIST_SINGLE(btf_sched_entity_ids, struct, sched_entity) + +static const struct bpf_func_proto bpf_sched_entity_to_tgidpid_proto = { + .func = bpf_sched_entity_to_tgidpid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +static const struct bpf_func_proto bpf_sched_entity_to_cgrpid_proto = { + .func = bpf_sched_entity_to_cgrpid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +static const struct bpf_func_proto bpf_sched_entity_belongs_to_cgrp_proto = { + .func = bpf_sched_entity_belongs_to_cgrp, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); + case BPF_FUNC_sched_entity_to_tgidpid: + return &bpf_sched_entity_to_tgidpid_proto; + case BPF_FUNC_sched_entity_to_cgrpid: + return &bpf_sched_entity_to_cgrpid_proto; + case BPF_FUNC_sched_entity_belongs_to_cgrp: + return &bpf_sched_entity_belongs_to_cgrp_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } }
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 31484377b8b1..be21512ee7be 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -435,6 +435,7 @@ class PrinterHelpers(Printer): 'struct xdp_md', 'struct path', 'struct btf_ptr', + 'struct sched_entity', ] known_types = { '...', @@ -478,6 +479,7 @@ class PrinterHelpers(Printer): 'struct task_struct', 'struct path', 'struct btf_ptr', + 'struct sched_entity', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7d7c04f698ec..522ace5af965 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3757,6 +3757,26 @@ union bpf_attr { * Get Ipv4 origdst or replysrc. Works with IPv4. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_sched_entity_to_tgidpid(struct sched_entity *se) + * Description + * Return task's encoded tgid and pid if the sched entity is a task. + * Return + * Tgid and pid encoded as tgid << 32 | pid, if *se* is a task. (u64)-1 otherwise. + * + * u64 bpf_sched_entity_to_cgrpid(struct sched_entity *se) + * Description + * Return cgroup id if the given sched entity is a cgroup. + * Return + * Cgroup id, if *se* is a cgroup. (u64)-1 otherwise. + * + * long bpf_sched_entity_belongs_to_cgrp(struct sched_entity *se, u64 cgrpid) + * Description + * Checks whether the sched entity belongs to a cgroup or + * it's sub-tree. It doesn't require a cgroup CPU controller + * to be enabled. + * Return + * Cgroup id, if *se* is a cgroup. (u64)-1 otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3917,6 +3937,9 @@ union bpf_attr { FN(redirect_peer), \ FN(get_sockops_uid_gid), \ FN(sk_original_addr), \ + FN(sched_entity_to_tgidpid), \ + FN(sched_entity_to_cgrpid), \ + FN(sched_entity_belongs_to_cgrp), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Roman Gushchin <guro@fb.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
Introduce a dedicated static key and the bpf_sched_enabled() wrapper to guard all invocations of bpf programs in the scheduler code.
It will help avoid any potential performance regression in the case when no scheduler bpf programs are attached.
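The intended call-site pattern then looks like the sketch below (the concrete hook and its placement come from a later patch in this series and are shown only to illustrate how the static key keeps the overhead down to a single patched-out branch when nothing is attached):

  #ifdef CONFIG_BPF_SCHED
          if (bpf_sched_enabled()) {
                  int ret = bpf_sched_cfs_check_preempt_tick(curr, delta_exec);

                  if (ret < 0)
                          return;                         /* suppress the preemption */
                  else if (ret > 0)
                          resched_curr(rq_of(cfs_rq));    /* force it */
          }
  #endif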
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
---
 include/linux/bpf_sched.h | 24 ++++++++++++++++++++++++
 kernel/bpf/syscall.c      | 11 +++++++++++
 kernel/sched/bpf_sched.c  |  2 ++
 3 files changed, 37 insertions(+)
diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h index 874393e6a6aa..9cd2493d2787 100644 --- a/include/linux/bpf_sched.h +++ b/include/linux/bpf_sched.h @@ -6,6 +6,8 @@
#ifdef CONFIG_BPF_SCHED
+#include <linux/jump_label.h> + #define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \ RET bpf_sched_##NAME(__VA_ARGS__); #include <linux/sched_hook_defs.h> @@ -14,6 +16,23 @@ int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog);
+DECLARE_STATIC_KEY_FALSE(bpf_sched_enabled_key); + +static inline bool bpf_sched_enabled(void) +{ + return static_branch_unlikely(&bpf_sched_enabled_key); +} + +static inline void bpf_sched_inc(void) +{ + static_branch_inc(&bpf_sched_enabled_key); +} + +static inline void bpf_sched_dec(void) +{ + static_branch_dec(&bpf_sched_enabled_key); +} + #else /* !CONFIG_BPF_SCHED */
static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, @@ -22,5 +41,10 @@ static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; }
+static inline bool bpf_sched_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SCHED */ #endif /* _LINUX_BPF_SCHED_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ff65862ae5ce..2f4091da923f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -31,6 +31,7 @@ #include <linux/poll.h> #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> +#include <linux/bpf_sched.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2531,6 +2532,11 @@ static void bpf_tracing_link_release(struct bpf_link *link) struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link);
+#ifdef CONFIG_BPF_SCHED + if (link->prog->type == BPF_PROG_TYPE_SCHED) + bpf_sched_dec(); +#endif + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, tr_link->trampoline));
@@ -2718,6 +2724,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_unlock; }
+#ifdef CONFIG_BPF_SCHED + if (prog->type == BPF_PROG_TYPE_SCHED) + bpf_sched_inc(); +#endif + link->tgt_prog = tgt_prog; link->trampoline = tr;
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 831b5917fcda..2ce2afcacb17 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -6,6 +6,8 @@ #include <linux/btf_ids.h> #include "sched.h"
+DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); + /* * For every hook declare a nop function where a BPF program can be attached. */
From: Roman Gushchin <guro@fb.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
This patch adds 3 hooks to control wakeup and tick preemption:

  cfs_check_preempt_tick
  cfs_check_preempt_wakeup
  cfs_wakeup_preempt_entity
The first one allows forcing or suppressing a preemption from the tick context. An obvious usage example is to minimize the number of non-voluntary context switches and decrease the associated latency penalty by (conditionally) providing tasks or task groups with an extended execution slice. It can be used instead of tweaking sysctl_sched_min_granularity.

The second one is called from the wakeup preemption code and allows redefining whether a newly woken task should preempt the execution of the current task. This is useful for minimizing the number of preemptions of latency-sensitive tasks. To some extent it's a more flexible analog of sysctl_sched_wakeup_granularity.

The third one is similar, but it tweaks the wakeup_preempt_entity() function, which is called not only from the wakeup context but also from pick_next_task(), making it possible to influence the decision on which task will run next.

It's open for discussion whether we need both of these hooks or only one of them: the second is more powerful, but depends more on the current implementation. In any case, bpf hooks are not an ABI, so it's not a deal breaker.
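To illustrate the first hook, the sketch below (conditionally) extends the slice of tasks in a chosen cgroup: a negative return suppresses the tick preemption, a positive one forces it, and 0 falls back to the default check_preempt_tick() logic. The cgroup id and the 6ms figure are placeholders, and the program relies on the helpers added earlier and the libbpf support added later in this series:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  const volatile __u64 boosted_cgrp_id;  /* placeholder, set by the loader */

  SEC("sched/cfs_check_preempt_tick")
  int BPF_PROG(check_preempt_tick, struct sched_entity *curr,
               unsigned long delta_exec)
  {
          /* let the boosted group run for up to 6ms before the tick may preempt it */
          if (bpf_sched_entity_belongs_to_cgrp(curr, boosted_cgrp_id) &&
              delta_exec < 6000000ULL)
                  return -1;

          return 0;       /* keep the default check_preempt_tick() behaviour */
  }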
The idea of the wakeup_preempt_entity hook belongs to Rik van Riel. He also contributed a lot to the whole patchset by providing his ideas, recommendations and feedback for earlier (non-public) versions.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
---
 include/linux/sched_hook_defs.h |  5 ++++-
 kernel/sched/fair.c             | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 14344004e335..e2f65e4b8895 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -1,2 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -BPF_SCHED_HOOK(int, 0, dummy, void) +BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec) +BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) +BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, + struct sched_entity *se) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bbb4730d238d..522640cd4aa2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -28,6 +28,7 @@ #include <linux/delay.h> #include <linux/tracehook.h> #endif +#include <linux/bpf_sched.h>
/* * Targeted preemption latency for CPU-bound tasks: @@ -4452,6 +4453,18 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_check_preempt_tick(curr, delta_exec); + + if (ret < 0) + return; + else if (ret > 0) + resched_curr(rq_of(cfs_rq)); + } +#endif + if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); /* @@ -7021,6 +7034,15 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) { s64 gran, vdiff = curr->vruntime - se->vruntime;
+#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_wakeup_preempt_entity(curr, se); + + if (ret) + return ret; + } +#endif + if (vdiff <= 0) return -1;
@@ -7107,6 +7129,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ likely(!task_has_idle_policy(p))) goto preempt;
+#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_check_preempt_wakeup(current, p); + + if (ret < 0) + return; + else if (ret > 0) + goto preempt; + } +#endif + /* * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick):
From: Roman Gushchin <guro@fb.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
This patch adds support for loading and attaching scheduler bpf programs.
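A sketch of a userspace loader using the new API; the object file name and program layout are illustrative, and error handling follows the ERR_PTR-style return convention of this libbpf version:

  #include <stdio.h>
  #include <unistd.h>
  #include <bpf/libbpf.h>

  int main(void)
  {
          struct bpf_object *obj;
          struct bpf_program *prog;
          struct bpf_link *link;

          obj = bpf_object__open_file("sched_prog.bpf.o", NULL);
          if (libbpf_get_error(obj))
                  return 1;

          /* SEC("sched/...") already selects BPF_PROG_TYPE_SCHED; the setter
           * below shows how to force the type explicitly if needed. */
          bpf_object__for_each_program(prog, obj)
                  bpf_program__set_sched(prog);

          if (bpf_object__load(obj))
                  return 1;

          bpf_object__for_each_program(prog, obj) {
                  link = bpf_program__attach_sched(prog);
                  if (libbpf_get_error(link))
                          return 1;
          }

          printf("sched bpf program(s) attached\n");
          pause();        /* keep the links (and thus the attachments) alive */
          return 0;
  }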
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
---
 tools/lib/bpf/libbpf.c   | 30 +++++++++++++++++++++++++---
 tools/lib/bpf/libbpf.h   |  4 ++++
 tools/lib/bpf/libbpf.map |  3 +++
 3 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c9f5eef6d3d8..223decdb93e2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2504,7 +2504,8 @@ static int bpf_object__finalize_btf(struct bpf_object *obj) static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) { if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || - prog->type == BPF_PROG_TYPE_LSM) + prog->type == BPF_PROG_TYPE_LSM || + prog->type == BPF_PROG_TYPE_SCHED) return true;
/* BPF_PROG_TYPE_TRACING programs which do not attach to other programs @@ -6813,7 +6814,8 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver)
if ((prog->type == BPF_PROG_TYPE_TRACING || prog->type == BPF_PROG_TYPE_LSM || - prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) { + prog->type == BPF_PROG_TYPE_EXT || + prog->type == BPF_PROG_TYPE_SCHED) && !prog->attach_btf_id) { btf_id = libbpf_find_attach_btf_id(prog); if (btf_id <= 0) return btf_id; @@ -8238,6 +8240,7 @@ BPF_PROG_TYPE_FNS(tracing, BPF_PROG_TYPE_TRACING); BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS); BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); +BPF_PROG_TYPE_FNS(sched, BPF_PROG_TYPE_SCHED);
enum bpf_attach_type bpf_program__get_expected_attach_type(struct bpf_program *prog) @@ -8302,6 +8305,8 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, struct bpf_program *prog); static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, struct bpf_program *prog); +static struct bpf_link *attach_sched(const struct bpf_sec_def *sec, + struct bpf_program *prog);
static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), @@ -8370,6 +8375,10 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_ITER, .is_attach_btf = true, .attach_fn = attach_iter), + SEC_DEF("sched/", SCHED, + .is_attach_btf = true, + .expected_attach_type = BPF_SCHED, + .attach_fn = attach_sched), BPF_EAPROG_SEC("xdp_devmap/", BPF_PROG_TYPE_XDP, BPF_XDP_DEVMAP), BPF_EAPROG_SEC("xdp_cpumap/", BPF_PROG_TYPE_XDP, @@ -8453,7 +8462,7 @@ static const struct bpf_sec_def section_defs[] = { #undef BPF_APROG_COMPAT #undef SEC_DEF
-#define MAX_TYPE_NAME_SIZE 32 +#define MAX_TYPE_NAME_SIZE 31
static const struct bpf_sec_def *find_sec_def(const char *sec_name) { @@ -8657,6 +8666,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, #define BTF_TRACE_PREFIX "btf_trace_" #define BTF_LSM_PREFIX "bpf_lsm_" #define BTF_ITER_PREFIX "bpf_iter_" +#define BTF_SCHED_PREFIX "bpf_sched_" #define BTF_MAX_NAME_SIZE 128
static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, @@ -8690,6 +8700,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, else if (attach_type == BPF_TRACE_ITER) err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name, BTF_KIND_FUNC); + else if (attach_type == BPF_SCHED) + err = find_btf_by_prefix_kind(btf, BTF_SCHED_PREFIX, name, + BTF_KIND_FUNC); else err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
@@ -9669,6 +9682,11 @@ struct bpf_link *bpf_program__attach_trace(struct bpf_program *prog) return bpf_program__attach_btf_id(prog); }
+struct bpf_link *bpf_program__attach_sched(struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog); +} + struct bpf_link *bpf_program__attach_lsm(struct bpf_program *prog) { return bpf_program__attach_btf_id(prog); @@ -9680,6 +9698,12 @@ static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, return bpf_program__attach_trace(prog); }
+static struct bpf_link *attach_sched(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + return bpf_program__attach_sched(prog); +} + static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, struct bpf_program *prog) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 57d10b779dea..a011179f705d 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -264,6 +264,8 @@ bpf_program__attach_xdp(struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(struct bpf_program *prog, int target_fd, const char *attach_func_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_sched(struct bpf_program *prog);
struct bpf_map;
@@ -360,6 +362,7 @@ LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog); LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); +LIBBPF_API int bpf_program__set_sched(struct bpf_program *prog);
LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog); LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, @@ -388,6 +391,7 @@ LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog); LIBBPF_API bool bpf_program__is_sk_lookup(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__is_sched(const struct bpf_program *prog);
/* * No need for __attribute__((packed)), all members of 'bpf_map_def' diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 4ebfadf45b47..16393cea53d0 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -336,4 +336,7 @@ LIBBPF_0.2.0 { perf_buffer__epoll_fd; perf_buffer__consume_buffer; xsk_socket__create_shared; + bpf_program__attach_sched; + bpf_program__is_sched; + bpf_program__set_sched; } LIBBPF_0.1.0;
From: Roman Gushchin <guro@fb.com>

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5F6X6
CVE: NA
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
Teach bpftool to recognize scheduler bpf programs.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
---
 tools/bpf/bpftool/common.c | 1 +
 tools/bpf/bpftool/prog.c   | 1 +
 tools/lib/bpf/bpf.c        | 3 ++-
 tools/lib/bpf/libbpf.c     | 3 ++-
 4 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 6ebf2b215ef4..81def197e774 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -66,6 +66,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_MODIFY_RETURN] = "mod_ret", [BPF_LSM_MAC] = "lsm_mac", [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_SCHED] = "sched", };
void p_err(const char *fmt, ...) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 592536904dde..4e1d8e57d951 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -64,6 +64,7 @@ const char * const prog_type_name[] = { [BPF_PROG_TYPE_EXT] = "ext", [BPF_PROG_TYPE_LSM] = "lsm", [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SCHED] = "sched", };
const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name); diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index d27e34133973..13e08c0d8b1a 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -236,7 +236,8 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, attr.prog_type == BPF_PROG_TYPE_LSM) { attr.attach_btf_id = load_attr->attach_btf_id; } else if (attr.prog_type == BPF_PROG_TYPE_TRACING || - attr.prog_type == BPF_PROG_TYPE_EXT) { + attr.prog_type == BPF_PROG_TYPE_EXT || + attr.prog_type == BPF_PROG_TYPE_SCHED) { attr.attach_btf_id = load_attr->attach_btf_id; attr.attach_prog_fd = load_attr->attach_prog_fd; } else { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 223decdb93e2..2894d837e9f8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6707,7 +6707,8 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, prog->type == BPF_PROG_TYPE_LSM) { load_attr.attach_btf_id = prog->attach_btf_id; } else if (prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_EXT) { + prog->type == BPF_PROG_TYPE_EXT || + prog->type == BPF_PROG_TYPE_SCHED) { load_attr.attach_prog_fd = prog->attach_prog_fd; load_attr.attach_btf_id = prog->attach_btf_id; } else {