Chen Hui (8):
      sched: programmable: Add user interface of task tag
      sched: programmable: add bpf_sched_task_tag_of helper function
      bpf: sched: Add helper functions to get cpu statistics
      sched: programmable: Add helper function for cpu topology.
      sched: programmable: Add convenient helper functions to convert sched entity
      bpf:programmable: Add cpumask ops collection
      sched: programmable: Add three hooks in select_task_rq_fair()
      sched: programmable: Add lib for sched programmable

Guan Jing (10):
      bpf: sched: basic infrastructure for scheduler bpf
      bpf: sched: introduce bpf_sched_enable()
      libbpf: add support for scheduler bpf programs
      bpftool: recognize scheduler programs
      sched: programmable: Add a tag for the task
      sched: programmable: Add a tag for the task group
      sched: programmable: Add user interface of task group tag
      sched: programmable: add bpf_sched_tg_tag_of helper function
      sched: cfs: add bpf hooks to control wakeup and tick preemption
      sched: programmable: Add hook for pick next task

Hui Tang (1):
      bpf:programmable: Add helper func to check cpu share cache

Ren Zhijie (2):
      sched: programmable: Add helpers to set tag of task or task_group
      sched: programmable: add bpf hooks to update rq and task state in enqueue_task/deqeue_task of CFS
 fs/proc/base.c | 65 ++++
 include/linux/bpf_sched.h | 50 ++++
 include/linux/bpf_topology.h | 46 +++
 include/linux/bpf_types.h | 4 +
 include/linux/sched.h | 83 ++++++
 include/linux/sched_hook_defs.h | 12 +
 include/uapi/linux/bpf.h | 125 ++++++++
 init/init_task.c | 3 +
 kernel/bpf/Kconfig | 11 +
 kernel/bpf/btf.c | 1 +
 kernel/bpf/helpers.c | 12 +
 kernel/bpf/syscall.c | 24 ++
 kernel/bpf/trampoline.c | 1 +
 kernel/bpf/verifier.c | 11 +-
 kernel/sched/bpf_sched.c | 395 +++++++++++++++++++++++++
 kernel/sched/bpf_topology.c | 87 ++++++
 kernel/sched/build_utility.c | 5 +
 kernel/sched/core.c | 114 +++++++
 kernel/sched/fair.c | 121 ++++++++
 kernel/sched/sched.h | 6 +
 scripts/bpf_doc.py | 18 ++
 tools/include/uapi/linux/bpf.h | 125 ++++++++
 tools/lib/bpf/Makefile | 2 +-
 tools/lib/bpf/bpf.c | 1 +
 tools/lib/bpf/libbpf.c | 23 +-
 tools/lib/bpf/libbpf.h | 2 +
 tools/lib/bpf/libbpf.map | 1 +
 tools/lib/bpf/libbpf_sched.h | 507 ++++++++++++++++++++++++++++++++
 28 files changed, 1852 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/bpf_sched.h
 create mode 100644 include/linux/bpf_topology.h
 create mode 100644 include/linux/sched_hook_defs.h
 create mode 100644 kernel/sched/bpf_sched.c
 create mode 100644 kernel/sched/bpf_topology.c
 create mode 100644 tools/lib/bpf/libbpf_sched.h
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
This commit introduces basic definitions and infrastructure for scheduler bpf programs. It defines the BPF_PROG_TYPE_SCHED program type and the BPF_SCHED attachment type.
The implementation is inspired by lsm bpf programs and is based on kretprobes. This allows new hooks to be added with minimal changes to the kernel code and without any changes to libbpf/bpftool. That is very convenient, as I anticipate a large number of private patches being used for a long time before (if ever) reaching upstream.
Sched programs are expected to return an int, whose meaning will be defined by the context.
This patch doesn't add any real scheduler hooks (only a stub); that will be done by the following patches in the series.
Scheduler bpf programs are, for now, very restricted in what they can do: only the bpf_printk() helper is available. The scheduler context can impose significant restrictions on what is safe and what is not, so their abilities will be extended on a case-by-case basis as the need arises.
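As an illustration, a minimal scheduler bpf program attaching to the dummy stub added by this patch could look like the sketch below. It assumes a vmlinux.h generated from a kernel built with CONFIG_BPF_SCHED and the "sched/" section handling added by the libbpf patch later in this series; the program must carry a GPL-compatible license to pass bpf_sched_verify_prog().

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* attaches to the bpf_sched_dummy() stub; only bpf_printk() is usable here */
SEC("sched/dummy")
int handle_sched_dummy(void *ctx)
{
    bpf_printk("sched bpf hook fired");
    return 0;
}

char _license[] SEC("license") = "GPL";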
Signed-off-by: Roman Gushchin guro@fb.com Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/bpf_sched.h | 26 ++++++++++++++ include/linux/bpf_types.h | 4 +++ include/linux/sched_hook_defs.h | 2 ++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/Kconfig | 11 ++++++ kernel/bpf/btf.c | 1 + kernel/bpf/syscall.c | 13 +++++++ kernel/bpf/trampoline.c | 1 + kernel/bpf/verifier.c | 11 +++++- kernel/sched/bpf_sched.c | 62 +++++++++++++++++++++++++++++++++ kernel/sched/build_utility.c | 4 +++ tools/include/uapi/linux/bpf.h | 2 ++ tools/lib/bpf/bpf.c | 1 + 13 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf_sched.h create mode 100644 include/linux/sched_hook_defs.h create mode 100644 kernel/sched/bpf_sched.c
diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h new file mode 100644 index 000000000000..874393e6a6aa --- /dev/null +++ b/include/linux/bpf_sched.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BPF_SCHED_H +#define _LINUX_BPF_SCHED_H + +#include <linux/bpf.h> + +#ifdef CONFIG_BPF_SCHED + +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \ + RET bpf_sched_##NAME(__VA_ARGS__); +#include <linux/sched_hook_defs.h> +#undef BPF_SCHED_HOOK + +int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog); + +#else /* !CONFIG_BPF_SCHED */ + +static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_BPF_SCHED */ +#endif /* _LINUX_BPF_SCHED_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fc0d6f32c687..dd79463eea4e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -83,6 +83,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter, struct bpf_nf_ctx, struct bpf_nf_ctx) #endif +#ifdef CONFIG_BPF_SCHED +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED, bpf_sched, + void *, void *) +#endif /* CONFIG_BPF_SCHED */
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h new file mode 100644 index 000000000000..14344004e335 --- /dev/null +++ b/include/linux/sched_hook_defs.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +BPF_SCHED_HOOK(int, 0, dummy, void) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c2273408bc16..99d1e9bb5aa3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -987,6 +987,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, + BPF_PROG_TYPE_SCHED, };
enum bpf_attach_type { @@ -1036,6 +1037,7 @@ enum bpf_attach_type { BPF_LSM_CGROUP, BPF_STRUCT_OPS, BPF_NETFILTER, + BPF_SCHED, __MAX_BPF_ATTACH_TYPE };
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 2dfe1079f772..1d5c1b239e4a 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -99,4 +99,15 @@ config BPF_LSM
If you are unsure how to answer this question, answer N.
+config BPF_SCHED + bool "SCHED Instrumentation with BPF" + depends on BPF_EVENTS + depends on BPF_SYSCALL + help + Enables instrumentation of the sched hooks with eBPF programs for + implementing dynamic scheduling policies. + + If you are unsure howto answer this question, answer N. + + endmenu # "BPF subsystem" diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 72b32b7cd9cd..da042a4479c1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5973,6 +5973,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; t = btf_type_by_id(btf, t->type); break; + case BPF_SCHED: case BPF_MODIFY_RETURN: /* For now the BPF_MODIFY_RETURN can only be attached to * functions that return an int. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1c8733f76b8..5f05a0094e54 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2363,6 +2363,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_SCHED: break; default: return -EINVAL; @@ -2490,6 +2491,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ case BPF_PROG_TYPE_EXT: /* extends any prog */ + case BPF_PROG_TYPE_SCHED: return true; default: return false; @@ -3032,6 +3034,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } break; + case BPF_PROG_TYPE_SCHED: + if (prog->expected_attach_type != BPF_SCHED) { + err = -EINVAL; + goto out_put_prog; + } + break; default: err = -EINVAL; goto out_put_prog; @@ -3348,6 +3356,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_SCHED: if (user_tp_name) /* The attach point for this category of programs * should be specified via btf_id during program load. @@ -3508,6 +3517,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_XDP; case BPF_LSM_CGROUP: return BPF_PROG_TYPE_LSM; + case BPF_SCHED: + return BPF_PROG_TYPE_SCHED; default: return BPF_PROG_TYPE_UNSPEC; } @@ -4599,6 +4610,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
switch (prog->type) { case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_SCHED: break; case BPF_PROG_TYPE_NETFILTER: if (attr->link_create.attach_type != BPF_NETFILTER) { @@ -4640,6 +4652,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = cgroup_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_SCHED: ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ac021bc43a66..3104075969d5 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -487,6 +487,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_SCHED: case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cf5f230360f5..20511c711931 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -25,6 +25,7 @@ #include <linux/btf_ids.h> #include <linux/poison.h> #include <linux/module.h> +#include <linux/bpf_sched.h>
#include "disasm.h"
@@ -18554,6 +18555,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_SCHED: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -18729,7 +18731,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM && - prog->type != BPF_PROG_TYPE_EXT) + prog->type != BPF_PROG_TYPE_EXT && + prog->type != BPF_PROG_TYPE_SCHED) return 0;
ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info); @@ -18773,6 +18776,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; }
+ if (prog->type == BPF_PROG_TYPE_SCHED) { + ret = bpf_sched_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c new file mode 100644 index 000000000000..2360404d4a07 --- /dev/null +++ b/kernel/sched/bpf_sched.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <linux/cgroup.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf_sched.h> +#include <linux/btf_ids.h> +#include "sched.h" + +/* + * For every hook declare a nop function where a BPF program can be attached. + */ +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \ +noinline RET bpf_sched_##NAME(__VA_ARGS__) \ +{ \ + return DEFAULT; \ +} + +#include <linux/sched_hook_defs.h> +#undef BPF_SCHED_HOOK + +#define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_sched_##NAME) +BTF_SET_START(bpf_sched_hooks) +#include <linux/sched_hook_defs.h> +#undef BPF_SCHED_HOOK +BTF_SET_END(bpf_sched_hooks) + +int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + if (!prog->gpl_compatible) { + bpf_log(vlog, + "sched programs must have a GPL compatible license\n"); + return -EINVAL; + } + + if (!btf_id_set_contains(&bpf_sched_hooks, prog->aux->attach_btf_id)) { + bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto * +bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + default: + return bpf_base_func_proto(func_id); + } +} + +const struct bpf_prog_ops bpf_sched_prog_ops = { +}; + +const struct bpf_verifier_ops bpf_sched_verifier_ops = { + .get_func_proto = bpf_sched_func_proto, + .is_valid_access = btf_ctx_access, +}; diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 99bdd96f454f..d44c584d9bc7 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -108,3 +108,7 @@ #ifdef CONFIG_SCHED_AUTOGROUP # include "autogroup.c" #endif + +#ifdef CONFIG_BPF_SCHED +# include "bpf_sched.c" +#endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c2273408bc16..99d1e9bb5aa3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -987,6 +987,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, + BPF_PROG_TYPE_SCHED, };
enum bpf_attach_type { @@ -1036,6 +1037,7 @@ enum bpf_attach_type { BPF_LSM_CGROUP, BPF_STRUCT_OPS, BPF_NETFILTER, + BPF_SCHED, __MAX_BPF_ATTACH_TYPE };
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 128ac723c4ea..33773ca93d18 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -725,6 +725,7 @@ int bpf_link_create(int prog_fd, int target_fd, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_SCHED: case BPF_LSM_MAC: attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0); if (!OPTS_ZEROED(opts, tracing))
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
Introduce a dedicated static key and the bpf_sched_enabled() wrapper to guard all invocations of bpf programs in the scheduler code.
It helps avoid any potential performance regression when no scheduler bpf programs are attached.
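To illustrate the intended usage, a hook call site could be guarded as in the sketch below. This is illustrative only: the surrounding function is a placeholder and bpf_sched_dummy() is the stub from the previous patch, not a real call site in scheduler code.

#include <linux/bpf_sched.h>

static void example_hook_site(void)
{
    if (bpf_sched_enabled()) {
        /* only pay the cost of the call when a prog is attached */
        int ret = bpf_sched_dummy();

        /* the meaning of ret is defined by the caller of the hook */
        (void)ret;
    }
}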
Signed-off-by: Roman Gushchin guro@fb.com Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/bpf_sched.h | 24 ++++++++++++++++++++++++ kernel/bpf/syscall.c | 11 +++++++++++ kernel/sched/bpf_sched.c | 2 ++ 3 files changed, 37 insertions(+)
diff --git a/include/linux/bpf_sched.h b/include/linux/bpf_sched.h index 874393e6a6aa..9cd2493d2787 100644 --- a/include/linux/bpf_sched.h +++ b/include/linux/bpf_sched.h @@ -6,6 +6,8 @@
#ifdef CONFIG_BPF_SCHED
+#include <linux/jump_label.h> + #define BPF_SCHED_HOOK(RET, DEFAULT, NAME, ...) \ RET bpf_sched_##NAME(__VA_ARGS__); #include <linux/sched_hook_defs.h> @@ -14,6 +16,23 @@ int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog);
+DECLARE_STATIC_KEY_FALSE(bpf_sched_enabled_key); + +static inline bool bpf_sched_enabled(void) +{ + return static_branch_unlikely(&bpf_sched_enabled_key); +} + +static inline void bpf_sched_inc(void) +{ + static_branch_inc(&bpf_sched_enabled_key); +} + +static inline void bpf_sched_dec(void) +{ + static_branch_dec(&bpf_sched_enabled_key); +} + #else /* !CONFIG_BPF_SCHED */
static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, @@ -22,5 +41,10 @@ static inline int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; }
+static inline bool bpf_sched_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SCHED */ #endif /* _LINUX_BPF_SCHED_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5f05a0094e54..422eb44ef6cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -36,6 +36,7 @@ #include <linux/memcontrol.h> #include <linux/trace_events.h> #include <net/netfilter/nf_bpf_link.h> +#include <linux/bpf_sched.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2951,6 +2952,11 @@ static void bpf_tracing_link_release(struct bpf_link *link) struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link);
+#ifdef CONFIG_BPF_SCHED + if (link->prog->type == BPF_PROG_TYPE_SCHED) + bpf_sched_dec(); +#endif + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline));
@@ -3159,6 +3165,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_unlock; }
+#ifdef CONFIG_BPF_SCHED + if (prog->type == BPF_PROG_TYPE_SCHED) + bpf_sched_inc(); +#endif + link->tgt_prog = tgt_prog; link->trampoline = tr;
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 2360404d4a07..e2525bd60abf 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -6,6 +6,8 @@ #include <linux/btf_ids.h> #include "sched.h"
+DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); + /* * For every hook declare a nop function where a BPF program can be attached. */
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
This patch adds support for loading and attaching scheduler bpf programs.
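A minimal userspace loader using the new API could look like the sketch below; the object file name "sched_prog.bpf.o" is a placeholder and the program name matches the earlier sketch.

#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
    struct bpf_object *obj;
    struct bpf_program *prog;
    struct bpf_link *link;

    obj = bpf_object__open_file("sched_prog.bpf.o", NULL);
    if (libbpf_get_error(obj))
        return 1;
    if (bpf_object__load(obj))
        goto out;

    prog = bpf_object__find_program_by_name(obj, "handle_sched_dummy");
    if (!prog)
        goto out;

    /* attaches to the BTF id named by the program's SEC("sched/...") */
    link = bpf_program__attach_sched(prog);
    if (libbpf_get_error(link))
        goto out;

    while (1)
        sleep(1);    /* keep the link (and the attachment) alive */
out:
    bpf_object__close(obj);
    return 1;
}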
Signed-off-by: Roman Gushchin guro@fb.com Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- tools/lib/bpf/libbpf.c | 22 +++++++++++++++++++++- tools/lib/bpf/libbpf.h | 2 ++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a27f6e9ccce7..7e9240076219 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -204,6 +204,7 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", [BPF_PROG_TYPE_SYSCALL] = "syscall", [BPF_PROG_TYPE_NETFILTER] = "netfilter", + [BPF_PROG_TYPE_SCHED] = "sched", };
static int __base_pr(enum libbpf_print_level level, const char *format, @@ -2999,7 +3000,8 @@ static int bpf_object_fixup_btf(struct bpf_object *obj) static bool prog_needs_vmlinux_btf(struct bpf_program *prog) { if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || - prog->type == BPF_PROG_TYPE_LSM) + prog->type == BPF_PROG_TYPE_LSM || + prog->type == BPF_PROG_TYPE_SCHED) return true;
/* BPF_PROG_TYPE_TRACING programs which do not attach to other programs @@ -8629,6 +8631,7 @@ static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_sched(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static const struct bpf_sec_def section_defs[] = { SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE), @@ -8714,6 +8717,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE), SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), SEC_DEF("netfilter", NETFILTER, BPF_NETFILTER, SEC_NONE), + SEC_DEF("sched/", SCHED, BPF_SCHED, SEC_ATTACH_BTF, attach_sched), };
static size_t custom_sec_def_cnt; @@ -9096,6 +9100,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, #define BTF_TRACE_PREFIX "btf_trace_" #define BTF_LSM_PREFIX "bpf_lsm_" #define BTF_ITER_PREFIX "bpf_iter_" +#define BTF_SCHED_PREFIX "bpf_sched_" #define BTF_MAX_NAME_SIZE 128
void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, @@ -9115,6 +9120,10 @@ void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, *prefix = BTF_ITER_PREFIX; *kind = BTF_KIND_FUNC; break; + case BPF_SCHED: + *prefix = BTF_SCHED_PREFIX; + *kind = BTF_KIND_FUNC; + break; default: *prefix = ""; *kind = BTF_KIND_FUNC; @@ -11694,6 +11703,17 @@ static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_l return libbpf_get_error(*link); }
+struct bpf_link *bpf_program__attach_sched(const struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog, NULL); +} + +static int attach_sched(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_sched(prog); + return libbpf_get_error(*link); +} + struct bpf_link *bpf_program__attach(const struct bpf_program *prog) { struct bpf_link *link = NULL; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 0b7362397ea3..ea015bf94864 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -717,6 +717,8 @@ bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(const struct bpf_program *prog, int target_fd, const char *attach_func_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_sched(const struct bpf_program *prog);
struct bpf_map;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index a5aa3a383d69..5e7ce61f7c4d 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -236,6 +236,7 @@ LIBBPF_0.2.0 { perf_buffer__buffer_fd; perf_buffer__epoll_fd; perf_buffer__consume_buffer; + bpf_program__attach_sched; } LIBBPF_0.1.0;
LIBBPF_0.3.0 {
maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
Teach bpftool to recognize scheduler bpf programs.
Signed-off-by: Roman Gushchin guro@fb.com Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7e9240076219..0d30daeaf9f9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -118,6 +118,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", [BPF_STRUCT_OPS] = "struct_ops", [BPF_NETFILTER] = "netfilter", + [BPF_SCHED] = "sched", };
static const char * const link_type_name[] = {
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add a tag for the task, useful for identifying special tasks. Users can use the file system interface to set different tags for specific workloads, and kernel subsystems can use the set_* helpers to set the tag as well. The bpf prog reads the tag to detect different workloads.
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched.h | 4 ++++ init/init_task.c | 3 +++ kernel/sched/core.c | 3 +++ 3 files changed, 10 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6db0879089df..56d9c99d0247 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1545,6 +1545,10 @@ struct task_struct { const cpumask_t *select_cpus; #endif
+#ifdef CONFIG_BPF_SCHED + long tag; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/init/init_task.c b/init/init_task.c index ac0c5850f74b..2101c6e3432d 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -213,6 +213,9 @@ struct task_struct init_task #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_BPF_SCHED + .tag = 0, +#endif }; EXPORT_SYMBOL(init_task);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 08ce8aada0b0..169f1e7f5317 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3763,6 +3763,9 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } #endif +#ifdef CONFIG_BPF_SCHED + p->tag = 0; +#endif }
/*
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add a tag for the task group, to support the tag-based scheduling mechanism.
The tag is used to identify a special task or a type of special tasks. There are many such tasks in the real world, for example foreground and background tasks or online and offline tasks, so with the tag we can identify these special tasks and apply specific policies to them.
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- kernel/sched/core.c | 19 +++++++++++++++++++ kernel/sched/sched.h | 3 +++ 2 files changed, 22 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 169f1e7f5317..9fa00bcc5124 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10425,6 +10425,13 @@ static void sched_unregister_group(struct task_group *tg) call_rcu(&tg->rcu, sched_free_group_rcu); }
+#ifdef CONFIG_BPF_SCHED +static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg) +{ + tg->tag = ptg->tag; +} +#endif + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -10445,6 +10452,10 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err;
+#ifdef CONFIG_BPF_SCHED + tg_init_tag(tg, parent); +#endif + alloc_uclamp_sched_group(tg, parent);
return tg; @@ -10532,6 +10543,14 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group sched_change_qos_group(tsk, group); #endif
+#ifdef CONFIG_BPF_SCHED + /* + * This function has cleared and restored the task status, + * so we do not need to dequeue and enqueue the task again. + */ + tsk->tag = group->tag; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7393c1a62513..a5c84860ca39 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -422,6 +422,9 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; #endif
+#ifdef CONFIG_BPF_SCHED + long tag; +#endif };
#ifdef CONFIG_FAIR_GROUP_SCHED
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add a user interface for the task group tag, which bridges the information gap between user mode and kernel mode.
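For example, all tasks of a CPU cgroup can be tagged by writing to the new file. A sketch: the cgroup v1 mount point and the group name "online" are assumptions, only the cpu.tag file itself comes from this patch.

#include <stdio.h>

int main(void)
{
    /* the write propagates down the subtree via walk_tg_tree_from() */
    FILE *f = fopen("/sys/fs/cgroup/cpu/online/cpu.tag", "w");

    if (!f) {
        perror("fopen");
        return 1;
    }
    fprintf(f, "1\n");
    fclose(f);
    return 0;
}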
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched.h | 6 ++++ kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 3 ++ 3 files changed, 90 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 56d9c99d0247..4160ae144b84 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1374,6 +1374,7 @@ struct task_struct { #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif + #endif
#ifdef CONFIG_KMSAN @@ -2482,4 +2483,9 @@ static inline int sched_qos_cpu_overload(void) } #endif
+ +#ifdef CONFIG_BPF_SCHED +extern void sched_settag(struct task_struct *tsk, s64 tag); +#endif + #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9fa00bcc5124..792dc70feac9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11316,6 +11316,80 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_BPF_SCHED +void sched_settag(struct task_struct *tsk, s64 tag) +{ + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq_flags rf; + struct rq *rq; + + if (tsk->tag == tag) + return; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + update_rq_clock(rq); + if (queued) + dequeue_task(rq, tsk, queue_flags); + if (running) + put_prev_task(rq, tsk); + + tsk->tag = tag; + + if (queued) + enqueue_task(rq, tsk, queue_flags); + if (running) + set_next_task(rq, tsk); + + task_rq_unlock(rq, tsk, &rf); +} + +int tg_change_tag(struct task_group *tg, void *data) +{ + struct css_task_iter it; + struct task_struct *tsk; + s64 tag = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->tag = tag; + + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_settag(tsk, tag); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_tag_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 tag) +{ + struct task_group *tg = css_tg(css); + + if (tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +} + +static inline s64 cpu_tag_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->tag; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -11382,6 +11456,13 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_qos_read, .write_s64 = cpu_qos_write, }, +#endif +#ifdef CONFIG_BPF_SCHED + { + .name = "tag", + .read_s64 = cpu_tag_read, + .write_s64 = cpu_tag_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a5c84860ca39..f4e65a5e3009 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -459,6 +459,9 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) }
extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_BPF_SCHED +extern int tg_change_tag(struct task_group *tg, void *data); +#endif
extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add a user interface for the task tag, which bridges the information gap between user mode and kernel mode.
Add proc interface: /proc/${pid}/task/${pid}/tag
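Usage sketch (the pid and the tag value are arbitrary examples):

#include <stdio.h>

int main(void)
{
    char path[64];
    long tag = 0;
    int pid = 1234;    /* example pid */
    FILE *f;

    snprintf(path, sizeof(path), "/proc/%d/task/%d/tag", pid, pid);

    f = fopen(path, "w");    /* the write ends up in sched_settag() */
    if (!f)
        return 1;
    fprintf(f, "1\n");
    fclose(f);

    f = fopen(path, "r");    /* the read shows tsk->tag */
    if (!f)
        return 1;
    if (fscanf(f, "%ld", &tag) == 1)
        printf("tag = %ld\n", tag);
    fclose(f);
    return 0;
}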
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- fs/proc/base.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 420e1d572856..91b45d73c9d7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3660,6 +3660,68 @@ static const struct inode_operations proc_tid_comm_inode_operations = { .permission = proc_tid_comm_permission, };
+#ifdef CONFIG_BPF_SCHED +static ssize_t pid_tag_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *tsk; + char buffer[PROC_NUMBUF]; + int err = 0, tag = 0; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &tag); + if (err) + goto out; + + sched_settag(tsk, tag); + +out: + put_task_struct(tsk); + return err < 0 ? err : count; +} + +static int pid_tag_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *tsk; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + seq_printf(m, "%ld\n", tsk->tag); + put_task_struct(tsk); + + return 0; +} + +static int pid_tag_open(struct inode *inode, struct file *flip) +{ + return single_open(flip, pid_tag_show, inode); +} + +static const struct file_operations proc_pid_tag_operations = { + .open = pid_tag_open, + .read = seq_read, + .write = pid_tag_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * Tasks */ @@ -3770,6 +3832,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations), #endif +#ifdef CONFIG_BPF_SCHED + REG("tag", 0644, proc_pid_tag_operations), +#endif };
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
This helper function reads the tag of a task group. The bpf prog uses the tag to detect different workloads.
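A sketch of how a bpf program could use the helper. The hook name and its task_struct argument are hypothetical (real hooks are added later in the series), and the helper prototype is assumed to come from a bpf_helper_defs.h regenerated from the updated uapi header.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/example_hook")    /* hypothetical hook */
int BPF_PROG(check_tg_tag, struct task_struct *p)
{
    long tag = bpf_sched_tg_tag_of(p->sched_task_group);

    if (tag == 1)
        bpf_printk("task %d is in a tagged group", p->pid);
    return 0;
}

char _license[] SEC("license") = "GPL";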
Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/helpers.c | 3 +++ kernel/sched/bpf_sched.c | 23 +++++++++++++++++++++++ scripts/bpf_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 5 files changed, 46 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 99d1e9bb5aa3..662bb9a46c8f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5574,6 +5574,14 @@ union bpf_attr { * Get Ipv4 origdst or replysrc. Works with IPv4. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5790,6 +5798,7 @@ union bpf_attr { FN(cgrp_storage_delete, 211, ##ctx) \ FN(get_sockops_uid_gid, 212, ##ctx) \ FN(sk_original_addr, 213, ##ctx) \ + FN(sched_tg_tag_of, 214, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8d368fa353f9..d50b82093b57 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1654,6 +1654,7 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; +const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak;
const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -1703,6 +1704,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; + case BPF_FUNC_sched_tg_tag_of: + return &bpf_sched_tg_tag_of_proto; default: break; } diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index e2525bd60abf..975c4f984856 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -62,3 +62,26 @@ const struct bpf_verifier_ops bpf_sched_verifier_ops = { .get_func_proto = bpf_sched_func_proto, .is_valid_access = btf_ctx_access, }; + +BPF_CALL_1(bpf_sched_tg_tag_of, struct task_group *, tg) +{ + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + if (tg == NULL) + return -EINVAL; + ret = tg->tag; +#endif + + return ret; +} + +BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group) + +const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { + .func = bpf_sched_tg_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], +}; diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index eaae2ce78381..f091447792c0 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -700,6 +700,7 @@ class PrinterHelpers(Printer): 'struct bpf_dynptr', 'struct iphdr', 'struct ipv6hdr', + 'struct task_group', ] known_types = { '...', @@ -755,6 +756,7 @@ class PrinterHelpers(Printer): 'const struct bpf_dynptr', 'struct iphdr', 'struct ipv6hdr', + 'struct task_group', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 99d1e9bb5aa3..662bb9a46c8f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5574,6 +5574,14 @@ union bpf_attr { * Get Ipv4 origdst or replysrc. Works with IPv4. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5790,6 +5798,7 @@ union bpf_attr { FN(cgrp_storage_delete, 211, ##ctx) \ FN(get_sockops_uid_gid, 212, ##ctx) \ FN(sk_original_addr, 213, ##ctx) \ + FN(sched_tg_tag_of, 214, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
This helper function reads the tag of a task_struct. The bpf prog uses the tag to detect different workloads.
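The same pattern works for the per-task tag (again a sketch; the hook and its task_struct argument are hypothetical):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/example_hook")    /* hypothetical hook */
int BPF_PROG(check_task_tag, struct task_struct *p)
{
    /* 0 is the default tag; the return value's meaning is hook-defined */
    return bpf_sched_task_tag_of(p) == 1;
}

char _license[] SEC("license") = "GPL";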
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/helpers.c | 3 +++ kernel/sched/bpf_sched.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 36 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 662bb9a46c8f..aa071e9bde72 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5582,6 +5582,13 @@ union bpf_attr { * Return * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or * a negative error in case of failure. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*.The bpf prog obtains the tags to detect + * different workloads. + * Return + * Task tag, if used, 0 as default tag, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5799,6 +5806,7 @@ union bpf_attr { FN(get_sockops_uid_gid, 212, ##ctx) \ FN(sk_original_addr, 213, ##ctx) \ FN(sched_tg_tag_of, 214, ##ctx) \ + FN(sched_task_tag_of, 215, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d50b82093b57..3424eba97132 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1655,6 +1655,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_task_tag_of_proto __weak;
const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -1706,6 +1707,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_strtoul_proto; case BPF_FUNC_sched_tg_tag_of: return &bpf_sched_tg_tag_of_proto; + case BPF_FUNC_sched_task_tag_of: + return &bpf_sched_task_tag_of_proto; default: break; } diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 975c4f984856..ad42e521cee3 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -85,3 +85,20 @@ const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_sched_tg_ids[0], }; + +BPF_CALL_1(bpf_sched_task_tag_of, struct task_struct *, tsk) +{ + if (tsk == NULL) + return -EINVAL; + return tsk->tag; +} + +BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct) + +const struct bpf_func_proto bpf_sched_task_tag_of_proto = { + .func = bpf_sched_task_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], +}; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 662bb9a46c8f..aa071e9bde72 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5582,6 +5582,13 @@ union bpf_attr { * Return * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or * a negative error in case of failure. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*.The bpf prog obtains the tags to detect + * different workloads. + * Return + * Task tag, if used, 0 as default tag, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5799,6 +5806,7 @@ union bpf_attr { FN(get_sockops_uid_gid, 212, ##ctx) \ FN(sk_original_addr, 213, ##ctx) \ FN(sched_tg_tag_of, 214, ##ctx) \ + FN(sched_task_tag_of, 215, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
From: Ren Zhijie renzhijie2@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add the helper functions bpf_sched_set_tg_tag() and bpf_sched_set_task_tag() to set the tag of a task group or a task.
They cannot be called while rq->lock is held.
The use case is that other kernel subsystems, such as the networking subsystem, can use them to mark key tasks.
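A sketch of a possible use: a bpf program tags tasks on exec. The tp_btf attach point is an illustrative choice of a context where rq->lock is not held; only the two setter helpers come from this patch.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("tp_btf/sched_process_exec")
int BPF_PROG(tag_on_exec, struct task_struct *p, pid_t old_pid,
             struct linux_binprm *bprm)
{
    bpf_sched_set_task_tag(p, 1);                    /* tag the task */
    bpf_sched_set_tg_tag(p->sched_task_group, 1);    /* and its group */
    return 0;
}

char _license[] SEC("license") = "GPL";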
Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/uapi/linux/bpf.h | 14 +++++++++++ kernel/bpf/helpers.c | 6 +++++ kernel/sched/bpf_sched.c | 45 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 14 +++++++++++ 4 files changed, 79 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa071e9bde72..8ca15a8d2ff8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5589,6 +5589,18 @@ union bpf_attr { * different workloads. * Return * Task tag, if used, 0 as default tag, or a negative error in case of failure. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * 0 on success, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5807,6 +5819,8 @@ union bpf_attr { FN(sk_original_addr, 213, ##ctx) \ FN(sched_tg_tag_of, 214, ##ctx) \ FN(sched_task_tag_of, 215, ##ctx) \ + FN(sched_set_tg_tag, 216, ##ctx) \ + FN(sched_set_task_tag, 217, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3424eba97132..0a00d21a6f47 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1656,6 +1656,8 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak; const struct bpf_func_proto bpf_sched_task_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_set_tg_tag_proto __weak; +const struct bpf_func_proto bpf_sched_set_task_tag_proto __weak;
const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -1709,6 +1711,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_sched_tg_tag_of_proto; case BPF_FUNC_sched_task_tag_of: return &bpf_sched_task_tag_of_proto; + case BPF_FUNC_sched_set_tg_tag: + return &bpf_sched_set_tg_tag_proto; + case BPF_FUNC_sched_set_task_tag: + return &bpf_sched_set_task_tag_proto; default: break; } diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index ad42e521cee3..31a76cf337e6 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -102,3 +102,48 @@ const struct bpf_func_proto bpf_sched_task_tag_of_proto = { .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_sched_task_ids[0], }; + +BPF_CALL_2(bpf_sched_set_tg_tag, struct task_group *, tg, s64, tag) +{ +#if CONFIG_CGROUP_SCHED + if (tg == NULL || tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +#endif + return -EPERM; +} + +const struct bpf_func_proto bpf_sched_set_tg_tag_proto = { + .func = bpf_sched_set_tg_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_sched_set_task_tag, struct task_struct *, tsk, s64, tag) +{ + if (tsk == NULL) + return -EINVAL; + + sched_settag(tsk, tag); + return 0; +} + +const struct bpf_func_proto bpf_sched_set_task_tag_proto = { + .func = bpf_sched_set_task_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], + .arg2_type = ARG_ANYTHING, +}; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index aa071e9bde72..8ca15a8d2ff8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5589,6 +5589,18 @@ union bpf_attr { * different workloads. * Return * Task tag, if used, 0 as default tag, or a negative error in case of failure. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * 0 on success, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5807,6 +5819,8 @@ union bpf_attr { FN(sk_original_addr, 213, ##ctx) \ FN(sched_tg_tag_of, 214, ##ctx) \ FN(sched_task_tag_of, 215, ##ctx) \ + FN(sched_set_tg_tag, 216, ##ctx) \ + FN(sched_set_task_tag, 217, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add a helper function, bpf_sched_cpu_stats_of(), to get cpu statistics in the following dimensions:
1. cfs/rt/irq cpu load statistics
2. multiple types of nr_running statistics
3. cpu idle statistics
4. cpu capacity

Based on CPU statistics in these dimensions, specific scheduling policies can be implemented in the bpf program.
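For example, a bpf program could compare two CPUs and prefer an idle one, as in the sketch below. The hook and its arguments are hypothetical; struct bpf_sched_cpu_stats is visible through vmlinux.h.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/example_select_cpu")    /* hypothetical hook */
int BPF_PROG(prefer_idle, struct task_struct *p, int prev_cpu, int cand_cpu)
{
    struct bpf_sched_cpu_stats prev = {}, cand = {};

    if (bpf_sched_cpu_stats_of(prev_cpu, &prev, sizeof(prev)) ||
        bpf_sched_cpu_stats_of(cand_cpu, &cand, sizeof(cand)))
        return prev_cpu;

    /* move only if the candidate is idle and less utilized */
    if (cand.available_idle && cand.cfs_util_avg < prev.cfs_util_avg)
        return cand_cpu;
    return prev_cpu;
}

char _license[] SEC("license") = "GPL";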
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched.h | 32 +++++++++++++++ include/uapi/linux/bpf.h | 7 ++++ kernel/sched/bpf_sched.c | 71 ++++++++++++++++++++++++++++++++++ scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 7 ++++ 5 files changed, 119 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 4160ae144b84..b75403419fbd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2486,6 +2486,38 @@ static inline int sched_qos_cpu_overload(void)
#ifdef CONFIG_BPF_SCHED extern void sched_settag(struct task_struct *tsk, s64 tag); + +struct bpf_sched_cpu_stats { + /* load/util */ + unsigned long cfs_load_avg; + unsigned long cfs_runnable_avg; + unsigned long cfs_util_avg; + unsigned long rt_load_avg; + unsigned long rt_runnable_avg; + unsigned long rt_util_avg; + unsigned long irq_load_avg; + unsigned long irq_runnable_avg; + unsigned long irq_util_avg; + + /* nr_running */ + unsigned int nr_running; + unsigned int cfs_nr_running; + unsigned int cfs_h_nr_running; + unsigned int cfs_idle_h_nr_running; + unsigned int rt_nr_running; + unsigned int rr_nr_running; + + /* idle statistics */ + int available_idle; + unsigned int exit_latency; + unsigned long idle_stamp; + unsigned long avg_idle; + + /* capacity */ + unsigned long capacity; + unsigned long capacity_orig; +}; + #endif
#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8ca15a8d2ff8..34641b24e699 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5601,6 +5601,12 @@ union bpf_attr { * Set tag to *tsk*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_stats_of(int cpu, struct bpf_sched_cpu_stats *ctx, int len) + * Description + * Get multiple types of *cpu* statistics and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5821,6 +5827,7 @@ union bpf_attr { FN(sched_task_tag_of, 215, ##ctx) \ FN(sched_set_tg_tag, 216, ##ctx) \ FN(sched_set_task_tag, 217, ##ctx) \ + FN(sched_cpu_stats_of, 218, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 31a76cf337e6..815f5466637d 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -4,6 +4,7 @@ #include <linux/bpf_verifier.h> #include <linux/bpf_sched.h> #include <linux/btf_ids.h> +#include <linux/cpuidle.h> #include "sched.h"
DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); @@ -44,12 +45,82 @@ int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, return 0; }
+BPF_CALL_3(bpf_sched_cpu_stats_of, int, cpu, + struct bpf_sched_cpu_stats *, ctx, + int, len) +{ + struct cpuidle_state *idle; + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + rq = cpu_rq(cpu); + memset(ctx, 0, sizeof(struct bpf_sched_cpu_stats)); + + /* load/util */ +#ifdef CONFIG_SMP + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->cfs_load_avg = rq->cfs.avg.load_avg; + ctx->cfs_runnable_avg = rq->cfs.avg.runnable_avg; + ctx->cfs_util_avg = rq->cfs.avg.util_avg; + ctx->rt_load_avg = rq->avg_rt.load_avg; + ctx->rt_runnable_avg = rq->avg_rt.runnable_avg; + ctx->rt_util_avg = rq->avg_rt.util_avg; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + ctx->irq_load_avg = rq->avg_irq.load_avg; + ctx->irq_runnable_avg = rq->avg_irq.runnable_avg; + ctx->irq_util_avg = rq->avg_irq.util_avg; +#endif +#endif + + /* nr_running */ + ctx->nr_running = rq->nr_running; + ctx->cfs_nr_running = rq->cfs.nr_running; + ctx->cfs_h_nr_running = rq->cfs.h_nr_running; + ctx->cfs_idle_h_nr_running = rq->cfs.idle_h_nr_running; + ctx->rt_nr_running = rq->rt.rt_nr_running; + ctx->rr_nr_running = rq->rt.rr_nr_running; + + /* idle statistics */ + ctx->available_idle = available_idle_cpu(cpu); + idle = idle_get_state(rq); + if (idle) + ctx->exit_latency = idle->exit_latency; +#ifdef CONFIG_SMP + ctx->idle_stamp = rq->idle_stamp; + ctx->avg_idle = rq->avg_idle; +#endif + + /* capacity */ +#ifdef CONFIG_SMP + ctx->capacity = rq->cpu_capacity; + ctx->capacity_orig = rq->cpu_capacity_orig; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_stats_of_proto = { + .func = bpf_sched_cpu_stats_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); + case BPF_FUNC_sched_cpu_stats_of: + return &bpf_sched_cpu_stats_of_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index f091447792c0..e8bbfb801645 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -701,6 +701,7 @@ class PrinterHelpers(Printer): 'struct iphdr', 'struct ipv6hdr', 'struct task_group', + 'struct bpf_sched_cpu_stats', ] known_types = { '...', @@ -757,6 +758,7 @@ class PrinterHelpers(Printer): 'struct iphdr', 'struct ipv6hdr', 'struct task_group', + 'struct bpf_sched_cpu_stats', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8ca15a8d2ff8..34641b24e699 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5601,6 +5601,12 @@ union bpf_attr { * Set tag to *tsk*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_stats_of(int cpu, struct bpf_sched_cpu_stats *ctx, int len) + * Description + * Get multiple types of *cpu* statistics and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5821,6 +5827,7 @@ union bpf_attr { FN(sched_task_tag_of, 215, ##ctx) \ FN(sched_set_tg_tag, 216, ##ctx) \ FN(sched_set_task_tag, 217, ##ctx) \ + FN(sched_cpu_stats_of, 218, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add the bpf helper function bpf_init_cpu_topology(), which obtains cpu topology info through the topology_* macros defined in include/linux/topology.h and saves it in a BPF map.
The cpu topology info is useful for core selection in userspace.
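Usage sketch from the bpf side. The map layout (one struct bpf_cpu_topology slot per possible CPU) and the attachment to the dummy stub are assumptions; the helpers and structs come from this patch and are visible through vmlinux.h.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 256);    /* >= number of possible CPUs */
    __type(key, u32);
    __type(value, struct bpf_cpu_topology);
} cpu_topo SEC(".maps");

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 1);
    __type(key, u32);
    __type(value, struct bpf_cpumask_info);
} cpu_info SEC(".maps");

SEC("sched/dummy")
int init_topology(void *ctx)
{
    u32 key = 0;
    struct bpf_cpumask_info *cpus;

    /* record the topology of every active CPU into cpu_topo */
    if (bpf_init_cpu_topology(&cpu_topo))
        return 0;

    cpus = bpf_map_lookup_elem(&cpu_info, &key);
    if (!cpus)
        return 0;
    return bpf_get_cpumask_info(&cpu_info, cpus);
}

char _license[] SEC("license") = "GPL";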
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/bpf_topology.h | 46 ++++++++++++++++++ include/uapi/linux/bpf.h | 14 ++++++ kernel/sched/bpf_sched.c | 8 ++++ kernel/sched/bpf_topology.c | 87 ++++++++++++++++++++++++++++++++++ kernel/sched/build_utility.c | 1 + scripts/bpf_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 14 ++++++ 7 files changed, 174 insertions(+) create mode 100644 include/linux/bpf_topology.h create mode 100644 kernel/sched/bpf_topology.c
diff --git a/include/linux/bpf_topology.h b/include/linux/bpf_topology.h new file mode 100644 index 000000000000..0c7ee492edde --- /dev/null +++ b/include/linux/bpf_topology.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_BPF_TOPOLOGY_H +#define _LINUX_BPF_TOPOLOGY_H + +#include <linux/cpumask.h> + +struct bpf_cpu_topology { + int cpu; + int core_id; + int cluster_id; + int die_id; + int physical_package_id; + int numa_node; + struct cpumask thread_siblings; + struct cpumask core_siblings; + struct cpumask cluster_cpus; + struct cpumask die_cpus; + struct cpumask package_cpus; + struct cpumask node_cpu_lists; +}; + +struct bpf_cpumask_info { + unsigned int nums_possible_cpus; + unsigned int nums_active_cpus; + unsigned int nums_isolate_cpus; + unsigned int nr_cpu_ids; + unsigned int bpf_nr_cpumask_bits; + struct cpumask cpu_possible_cpumask; + struct cpumask cpu_active_cpumask; + struct cpumask cpu_isolate_cpumask; +}; + +#endif /* _LINUX_BPF_TOPOLOGY_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 34641b24e699..cb37b50acccd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5607,6 +5607,18 @@ union bpf_attr { * Get multiple types of *cpu* statistics and store in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_cpumask_info(struct bpf_map *map, struct bpf_cpumask_info *cpus) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5828,6 +5840,8 @@ union bpf_attr { FN(sched_set_tg_tag, 216, ##ctx) \ FN(sched_set_task_tag, 217, ##ctx) \ FN(sched_cpu_stats_of, 218, ##ctx) \ + FN(init_cpu_topology, 219, ##ctx) \ + FN(get_cpumask_info, 220, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 815f5466637d..29303c1afbe7 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -5,6 +5,7 @@ #include <linux/bpf_sched.h> #include <linux/btf_ids.h> #include <linux/cpuidle.h> +#include <linux/bpf_topology.h> #include "sched.h"
DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); @@ -27,6 +28,9 @@ BTF_SET_START(bpf_sched_hooks) #undef BPF_SCHED_HOOK BTF_SET_END(bpf_sched_hooks)
+const struct bpf_func_proto bpf_init_cpu_topology_proto __weak; +const struct bpf_func_proto bpf_get_cpumask_info_proto __weak; + int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -121,6 +125,10 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_get_trace_printk_proto(); case BPF_FUNC_sched_cpu_stats_of: return &bpf_sched_cpu_stats_of_proto; + case BPF_FUNC_init_cpu_topology: + return &bpf_init_cpu_topology_proto; + case BPF_FUNC_get_cpumask_info: + return &bpf_get_cpumask_info_proto; default: return bpf_base_func_proto(func_id); } diff --git a/kernel/sched/bpf_topology.c b/kernel/sched/bpf_topology.c new file mode 100644 index 000000000000..867c4b824366 --- /dev/null +++ b/kernel/sched/bpf_topology.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/bpf_verifier.h> +#include <linux/topology.h> +#include <linux/cpumask.h> +#include <linux/bpf_topology.h> +#include <linux/sched/isolation.h> + +static void bpf_update_cpu_topology(struct bpf_cpu_topology *bpf_cpu_topology, int cpu) +{ + bpf_cpu_topology->cpu = cpu; + bpf_cpu_topology->core_id = topology_core_id(cpu); + bpf_cpu_topology->cluster_id = topology_cluster_id(cpu); + bpf_cpu_topology->die_id = topology_die_id(cpu); + bpf_cpu_topology->physical_package_id = topology_physical_package_id(cpu); + bpf_cpu_topology->numa_node = cpu_to_node(cpu); + cpumask_copy(&bpf_cpu_topology->thread_siblings, topology_sibling_cpumask(cpu)); + cpumask_copy(&bpf_cpu_topology->core_siblings, topology_core_cpumask(cpu)); + cpumask_copy(&bpf_cpu_topology->cluster_cpus, topology_cluster_cpumask(cpu)); + cpumask_copy(&bpf_cpu_topology->die_cpus, topology_die_cpumask(cpu)); + cpumask_copy(&bpf_cpu_topology->package_cpus, topology_core_cpumask(cpu)); + cpumask_copy(&bpf_cpu_topology->node_cpu_lists, cpumask_of_node(cpu_to_node(cpu))); +} + +BPF_CALL_1(bpf_init_cpu_topology, struct bpf_map *, map) +{ + const struct cpumask *cpu_map = cpu_active_mask; + struct bpf_cpu_topology *topo; + int i = -1; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + for_each_cpu(i, cpu_map) { + topo = map->ops->map_lookup_elem(map, &i); + if (!topo) + return -ENOMEM; + + bpf_update_cpu_topology(topo, i); + } + + return 0; +} + +const struct bpf_func_proto bpf_init_cpu_topology_proto = { + .func = bpf_init_cpu_topology, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, +}; + +BPF_CALL_2(bpf_get_cpumask_info, struct bpf_map *, map, struct bpf_cpumask_info *, cpus) +{ + if (!cpus) + return -EINVAL; + + cpumask_copy(&cpus->cpu_possible_cpumask, cpu_possible_mask); + cpumask_copy(&cpus->cpu_active_cpumask, cpu_active_mask); + cpumask_copy(&cpus->cpu_isolate_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpus->nums_possible_cpus = num_possible_cpus(); + cpus->nums_active_cpus = num_active_cpus(); + cpus->nums_isolate_cpus = 
cpumask_weight(&cpus->cpu_isolate_cpumask); + cpus->nr_cpu_ids = nr_cpu_ids; + cpus->bpf_nr_cpumask_bits = nr_cpumask_bits; + + return 0; +} + +const struct bpf_func_proto bpf_get_cpumask_info_proto = { + .func = bpf_get_cpumask_info, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, +}; diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index d44c584d9bc7..eb4fab8fd18f 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -111,4 +111,5 @@
#ifdef CONFIG_BPF_SCHED # include "bpf_sched.c" +# include "bpf_topology.c" #endif diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index e8bbfb801645..b23f07438978 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -702,6 +702,8 @@ class PrinterHelpers(Printer): 'struct ipv6hdr', 'struct task_group', 'struct bpf_sched_cpu_stats', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', ] known_types = { '...', @@ -759,6 +761,8 @@ class PrinterHelpers(Printer): 'struct ipv6hdr', 'struct task_group', 'struct bpf_sched_cpu_stats', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 34641b24e699..cb37b50acccd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5607,6 +5607,18 @@ union bpf_attr { * Get multiple types of *cpu* statistics and store in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_cpumask_info(struct bpf_map *map, struct bpf_cpumask_info *cpus) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5828,6 +5840,8 @@ union bpf_attr { FN(sched_set_tg_tag, 216, ##ctx) \ FN(sched_set_task_tag, 217, ##ctx) \ FN(sched_cpu_stats_of, 218, ##ctx) \ + FN(init_cpu_topology, 219, ##ctx) \ + FN(get_cpumask_info, 220, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add three helper functions: 1) bpf_sched_entity_is_task checks whether a sched entity is a task. 2) bpf_sched_entity_to_task converts a sched entity to a task_struct. 3) bpf_sched_entity_to_tg converts a sched entity to a task group.
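As an illustrative, non-authoritative sketch, a hook prog can combine these with the tag helpers added earlier in this series. The "sched/" section name and the cfs_wakeup_preempt_entity attach point are assumptions, since that hook is only introduced by a later patch.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_wakeup_preempt_entity")
int BPF_PROG(se_tag_demo, struct sched_entity *curr, struct sched_entity *se)
{
	long tag = 0;

	if (bpf_sched_entity_is_task(se)) {
		/* the entity wraps a task: read the per-task tag */
		struct task_struct *p = bpf_sched_entity_to_task(se);

		if (p)
			tag = bpf_sched_task_tag_of(p);
	} else {
		/* otherwise it represents a task group: read the group tag */
		struct task_group *tg = bpf_sched_entity_to_tg(se);

		if (tg)
			tag = bpf_sched_tg_tag_of(tg);
	}

	/* illustrative policy: ask for preemption in favour of tagged entities */
	return tag > 0 ? 1 : 0;
}

char LICENSE[] SEC("license") = "GPL";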
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/uapi/linux/bpf.h | 21 +++++++++++ kernel/sched/bpf_sched.c | 69 ++++++++++++++++++++++++++++++++-- scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 21 +++++++++++ 4 files changed, 109 insertions(+), 4 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cb37b50acccd..b224509c795b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5619,6 +5619,24 @@ union bpf_attr { * Get system cpus returned in *cpus*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5842,6 +5860,9 @@ union bpf_attr { FN(sched_cpu_stats_of, 218, ##ctx) \ FN(init_cpu_topology, 219, ##ctx) \ FN(get_cpumask_info, 220, ##ctx) \ + FN(sched_entity_is_task, 221, ##ctx) \ + FN(sched_entity_to_task, 222, ##ctx) \ + FN(sched_entity_to_tg, 223, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 29303c1afbe7..4d40f659e0d6 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -117,6 +117,65 @@ static const struct bpf_func_proto bpf_sched_cpu_stats_of_proto = { .arg3_type = ARG_CONST_SIZE, };
+BTF_ID_LIST_SINGLE(btf_sched_entity_ids, struct, sched_entity) +BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct) +BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group) + +BPF_CALL_1(bpf_sched_entity_is_task, struct sched_entity *, se) +{ + return entity_is_task(se) ? 1 : 0; +} + +static const struct bpf_func_proto bpf_sched_entity_is_task_proto = { + .func = bpf_sched_entity_is_task, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_task, struct sched_entity *, se) +{ + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + return (unsigned long)tsk; + } + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_task_proto = { + .func = bpf_sched_entity_to_task, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_task_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_tg, struct sched_entity *, se) +{ +#if CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) { + struct task_group *tg = group_cfs_rq(se)->tg; + + return (unsigned long)tg; + } +#endif + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = { + .func = bpf_sched_entity_to_tg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_tg_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -129,6 +188,12 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_init_cpu_topology_proto; case BPF_FUNC_get_cpumask_info: return &bpf_get_cpumask_info_proto; + case BPF_FUNC_sched_entity_is_task: + return &bpf_sched_entity_is_task_proto; + case BPF_FUNC_sched_entity_to_task: + return &bpf_sched_entity_to_task_proto; + case BPF_FUNC_sched_entity_to_tg: + return &bpf_sched_entity_to_tg_proto; default: return bpf_base_func_proto(func_id); } @@ -155,8 +220,6 @@ BPF_CALL_1(bpf_sched_tg_tag_of, struct task_group *, tg) return ret; }
-BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group) - const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { .func = bpf_sched_tg_tag_of, .gpl_only = false, @@ -172,8 +235,6 @@ BPF_CALL_1(bpf_sched_task_tag_of, struct task_struct *, tsk) return tsk->tag; }
-BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct) - const struct bpf_func_proto bpf_sched_task_tag_of_proto = { .func = bpf_sched_task_tag_of, .gpl_only = false, diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index b23f07438978..11cf353ad57f 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -704,6 +704,7 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_stats', 'struct bpf_cpu_topology', 'struct bpf_cpumask_info', + 'struct sched_entity', ] known_types = { '...', @@ -763,6 +764,7 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_stats', 'struct bpf_cpu_topology', 'struct bpf_cpumask_info', + 'struct sched_entity', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index cb37b50acccd..b224509c795b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5619,6 +5619,24 @@ union bpf_attr { * Get system cpus returned in *cpus*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5842,6 +5860,9 @@ union bpf_attr { FN(sched_cpu_stats_of, 218, ##ctx) \ FN(init_cpu_topology, 219, ##ctx) \ FN(get_cpumask_info, 220, ##ctx) \ + FN(sched_entity_is_task, 221, ##ctx) \ + FN(sched_entity_to_task, 222, ##ctx) \ + FN(sched_entity_to_tg, 223, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add a collection of cpumask ops, such as cpumask_empty, cpumask_and, cpumask_andnot, cpumask_subset, cpumask_equal and cpumask_copy.
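A hedged sketch of how a prog might drive bpf_cpumask_op(); the attach point and the printed policy are illustrative, and op fields that a given op_type does not use are simply left zeroed, matching how libbpf_sched.h later in this series calls the helper.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_check_preempt_wakeup")
int BPF_PROG(cpumask_demo, struct task_struct *curr, struct task_struct *p)
{
	int cpu = bpf_get_smp_processor_id();
	struct cpumask_op_args op = {};
	long weight, on_this_cpu;

	/* CPUMASK_WEIGHT: count the CPUs the woken task may run on */
	op.op_type = CPUMASK_WEIGHT;
	op.arg1 = (void *)p->cpus_ptr;
	weight = bpf_cpumask_op(&op, sizeof(op));

	/* CPUMASK_TEST_CPU: is the current CPU in that mask? */
	op.op_type = CPUMASK_TEST_CPU;
	op.arg1 = &cpu;
	op.arg2 = (void *)p->cpus_ptr;
	on_this_cpu = bpf_cpumask_op(&op, sizeof(op));

	bpf_printk("weight=%ld on_this_cpu=%ld", weight, on_this_cpu);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";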
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched.h | 23 +++++++++ include/uapi/linux/bpf.h | 43 +++++++++++++++++ kernel/sched/bpf_sched.c | 87 ++++++++++++++++++++++++++++++++++ scripts/bpf_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++ 5 files changed, 200 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index b75403419fbd..b5b45c32a6b9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2518,6 +2518,29 @@ struct bpf_sched_cpu_stats { unsigned long capacity_orig; };
+struct cpumask_op_args { + unsigned int op_type; + void *arg1; + void *arg2; + void *arg3; + void *arg4; +}; + +enum cpumask_op_type { + CPUMASK_EMPTY, + CPUMASK_AND, + CPUMASK_ANDNOT, + CPUMASK_SUBSET, + CPUMASK_EQUAL, + CPUMASK_TEST_CPU, + CPUMASK_COPY, + CPUMASK_WEIGHT, + CPUMASK_NEXT, + CPUMASK_NEXT_WRAP, + CPUMASK_NEXT_AND, + CPUMASK_CPULIST_PARSE +}; + #endif
#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b224509c795b..afd786314ba9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5637,6 +5637,48 @@ union bpf_attr { * Return task group of *se* if se is a task group. * Return * Task struct if se is a task group, NULL otherwise. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * A series of cpumask-related operations. Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. *op*->type is one of them + * + * **CPUMASK_EMPTY** + * *(op->arg1) == 0 returned. + * **CPUMASK_AND** + * *(op->arg1) = *(op->arg2) & *(op->arg3) + * **CPUMASK_ANDNOT** + * *(op->arg1) = *(op->arg2) & ~*(op->arg3) + * **CPUMASK_SUBSET** + * *(op->arg1) & ~*(op->arg2) == 0 returned + * **CPUMASK_EQUAL** + * *(op->arg1) == *(op->arg2) returned + * **CPUMASK_TEST_CPU** + * test for a cpu *(int)(op->arg1) in *(op->arg2) + * returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0 + * **CPUMASK_COPY** + * *(op->arg1) = *(op->arg2), return 0 always + * **CPUMASK_WEIGHT** + * count of bits in *(op->arg1) + * **CPUMASK_NEXT** + * get the next cpu in *(struct cpumask *)(op->arg2) + * *(int *)(op->arg1): the cpu prior to the place to search + * **CPUMASK_NEXT_WRAP** + * helper to implement for_each_cpu_wrap + * @op->arg1: the cpu prior to the place to search + * @op->arg2: the cpumask pointer + * @op->arg3: the start point of the iteration + * @op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration + * returns >= nr_cpu_ids on completion + * **CPUMASK_NEXT_AND** + * get the next cpu in *(op->arg1) & *(op->arg2) + * **CPUMASK_CPULIST_PARSE** + * extract a cpumask from a user string of ranges. + * (char *)op->arg1 -> (struct cpumask *)(op->arg2) + * 0 on success, or a negative error in case of failure. + * Return + * View above. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5863,6 +5905,7 @@ union bpf_attr { FN(sched_entity_is_task, 221, ##ctx) \ FN(sched_entity_to_task, 222, ##ctx) \ FN(sched_entity_to_tg, 223, ##ctx) \ + FN(cpumask_op, 224, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 4d40f659e0d6..5f798d2bb153 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -176,6 +176,91 @@ static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = { .arg1_btf_id = &btf_sched_entity_ids[0], };
+BPF_CALL_2(bpf_cpumask_op, struct cpumask_op_args *, op, int, len) +{ + int ret; + + if (len != sizeof(*op) || !op->arg1) + return -EINVAL; + + switch (op->op_type) { + case CPUMASK_EMPTY: + return cpumask_empty((const struct cpumask *)op->arg1); + case CPUMASK_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_and((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_ANDNOT: + if (!op->arg2 || !op->arg3) + return -EINVAL; + cpumask_andnot((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + break; + case CPUMASK_SUBSET: + if (!op->arg2) + return -EINVAL; + return cpumask_subset((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_EQUAL: + if (!op->arg2) + return -EINVAL; + return cpumask_equal((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_TEST_CPU: + if (!op->arg2) + return -EINVAL; + return cpumask_test_cpu(*(int *)op->arg1, op->arg2); + case CPUMASK_COPY: + if (!op->arg2) + return -EINVAL; + cpumask_copy((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + break; + case CPUMASK_WEIGHT: + return cpumask_weight((const struct cpumask *)op->arg1); + case CPUMASK_NEXT: + if (!op->arg2) + return -EINVAL; + return cpumask_next(*(int *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_NEXT_WRAP: + if (!op->arg2 || !op->arg3 || !op->arg4) + return -EINVAL; + return cpumask_next_wrap(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + *(int *)op->arg3, *(int *)op->arg4); + case CPUMASK_NEXT_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_next_and(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_CPULIST_PARSE: + if (!op->arg2) + return -EINVAL; + + op->arg1 = (void *)strstrip((void *)op->arg1); + ret = cpulist_parse((void *)op->arg1, + (struct cpumask *)op->arg2); + return ret; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_cpumask_op_proto = { + .func = bpf_cpumask_op, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -194,6 +279,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_entity_to_task_proto; case BPF_FUNC_sched_entity_to_tg: return &bpf_sched_entity_to_tg_proto; + case BPF_FUNC_cpumask_op: + return &bpf_cpumask_op_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 11cf353ad57f..7f5952a3be61 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -705,6 +705,8 @@ class PrinterHelpers(Printer): 'struct bpf_cpu_topology', 'struct bpf_cpumask_info', 'struct sched_entity', + 'struct cpumask', + 'struct cpumask_op_args', ] known_types = { '...', @@ -765,6 +767,8 @@ class PrinterHelpers(Printer): 'struct bpf_cpu_topology', 'struct bpf_cpumask_info', 'struct sched_entity', + 'struct cpumask', + 'struct cpumask_op_args', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b224509c795b..afd786314ba9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5637,6 +5637,48 @@ union bpf_attr { * Return task group of *se* if se is a task group. 
* Return * Task struct if se is a task group, NULL otherwise. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * A series of cpumask-related operations. Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. *op*->type is one of them + * + * **CPUMASK_EMPTY** + * *(op->arg1) == 0 returned. + * **CPUMASK_AND** + * *(op->arg1) = *(op->arg2) & *(op->arg3) + * **CPUMASK_ANDNOT** + * *(op->arg1) = *(op->arg2) & ~*(op->arg3) + * **CPUMASK_SUBSET** + * *(op->arg1) & ~*(op->arg2) == 0 returned + * **CPUMASK_EQUAL** + * *(op->arg1) == *(op->arg2) returned + * **CPUMASK_TEST_CPU** + * test for a cpu *(int)(op->arg1) in *(op->arg2) + * returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0 + * **CPUMASK_COPY** + * *(op->arg1) = *(op->arg2), return 0 always + * **CPUMASK_WEIGHT** + * count of bits in *(op->arg1) + * **CPUMASK_NEXT** + * get the next cpu in *(struct cpumask *)(op->arg2) + * *(int *)(op->arg1): the cpu prior to the place to search + * **CPUMASK_NEXT_WRAP** + * helper to implement for_each_cpu_wrap + * @op->arg1: the cpu prior to the place to search + * @op->arg2: the cpumask pointer + * @op->arg3: the start point of the iteration + * @op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration + * returns >= nr_cpu_ids on completion + * **CPUMASK_NEXT_AND** + * get the next cpu in *(op->arg1) & *(op->arg2) + * **CPUMASK_CPULIST_PARSE** + * extract a cpumask from a user string of ranges. + * (char *)op->arg1 -> (struct cpumask *)(op->arg2) + * 0 on success, or a negative error in case of failure. + * Return + * View above. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5863,6 +5905,7 @@ union bpf_attr { FN(sched_entity_is_task, 221, ##ctx) \ FN(sched_entity_to_task, 222, ##ctx) \ FN(sched_entity_to_tg, 223, ##ctx) \ + FN(cpumask_op, 224, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
From: Hui Tang tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add a helper function to check whether two CPUs share the same LLC.
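For example (a sketch only; the attach point is an assumption):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_check_preempt_wakeup")
int BPF_PROG(llc_demo, struct task_struct *curr, struct task_struct *p)
{
	int this_cpu = bpf_get_smp_processor_id();

	/* does the current CPU share an LLC with CPU 0? */
	if (bpf_cpus_share_cache(0, this_cpu))
		bpf_printk("cpu %d shares LLC with cpu 0", this_cpu);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";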
Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/uapi/linux/bpf.h | 7 +++++++ kernel/sched/bpf_sched.c | 19 +++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 3 files changed, 33 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index afd786314ba9..d66c08937f7a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5679,6 +5679,12 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. * Return * View above. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * yes 1, no 0. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5906,6 +5912,7 @@ union bpf_attr { FN(sched_entity_to_task, 222, ##ctx) \ FN(sched_entity_to_tg, 223, ##ctx) \ FN(cpumask_op, 224, ##ctx) \ + FN(cpus_share_cache, 225, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 5f798d2bb153..912661b7caff 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -261,6 +261,23 @@ static const struct bpf_func_proto bpf_cpumask_op_proto = { .arg2_type = ARG_CONST_SIZE, };
+BPF_CALL_2(bpf_cpus_share_cache, int, src_cpu, int, dst_cpu) +{ + if ((unsigned int)src_cpu >= nr_cpu_ids || + (unsigned int)dst_cpu >= nr_cpu_ids) + return 0; + + return cpus_share_cache(src_cpu, dst_cpu); +} + +static const struct bpf_func_proto bpf_cpus_share_cache_proto = { + .func = bpf_cpus_share_cache, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -281,6 +298,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_entity_to_tg_proto; case BPF_FUNC_cpumask_op: return &bpf_cpumask_op_proto; + case BPF_FUNC_cpus_share_cache: + return &bpf_cpus_share_cache_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index afd786314ba9..ec23f842f054 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5679,6 +5679,12 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. * Return * View above. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * true yes, false no. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5906,6 +5912,7 @@ union bpf_attr { FN(sched_entity_to_task, 222, ##ctx) \ FN(sched_entity_to_tg, 223, ##ctx) \ FN(cpumask_op, 224, ##ctx) \ + FN(cpus_share_cache, 225, ##ctx) \ /* */
/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
Reference: https://lore.kernel.org/all/20210916162451.709260-1-guro@fb.com/
-------------------
This patch adds 3 hooks to control wakeup and tick preemption: cfs_check_preempt_tick, cfs_check_preempt_wakeup and cfs_wakeup_preempt_entity.
The first one allows us to force or suppress a preemption from the tick context. An obvious usage example is to minimize the number of non-voluntary context switches and decrease the associated latency penalty by (conditionally) providing tasks or task groups an extended execution slice. It can be used instead of tweaking sysctl_sched_min_granularity.
The second one is called from the wakeup preemption code and allows us to redefine whether a newly woken task should preempt the execution of the current task. This is useful to minimize the number of preemptions of latency-sensitive tasks. To some extent it's a more flexible analog of sysctl_sched_wakeup_granularity.
The third one is similar, but it tweaks the wakeup_preempt_entity() function, which is called not only from a wakeup context but also from pick_next_task(), which makes it possible to influence the decision on which task will run next.
It's open for discussion whether we need both of these hooks or only one of them: the second is more powerful, but depends more on the current implementation. In any case, bpf hooks are not an ABI, so it's not a deal breaker.
The idea of the wakeup_preempt_entity hook belongs to Rik van Riel. He also contributed a lot to the whole patchset by providing his ideas, recommendations and feedback on earlier (non-public) versions.
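As a hedged sketch of how the first two hooks could be used (the tag value, prog names and "sched/" section prefix are assumptions; the return-value semantics follow the fair.c changes below): keep latency-sensitive ("online") tagged tasks running and let them preempt others on wakeup.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define TAG_ONLINE	1	/* illustrative tag value set from userspace */

SEC("sched/cfs_check_preempt_wakeup")
int BPF_PROG(wakeup_preempt, struct task_struct *curr, struct task_struct *p)
{
	long curr_tag = bpf_sched_task_tag_of(curr);
	long wakee_tag = bpf_sched_task_tag_of(p);

	if (curr_tag == TAG_ONLINE && wakee_tag != TAG_ONLINE)
		return -1;	/* < 0: suppress preemption of the online task */
	if (wakee_tag == TAG_ONLINE && curr_tag != TAG_ONLINE)
		return 1;	/* > 0: force preemption in favour of the wakee */
	return 0;		/* 0: fall back to the default policy */
}

SEC("sched/cfs_check_preempt_tick")
int BPF_PROG(tick_preempt, struct sched_entity *curr, unsigned long delta_exec)
{
	if (bpf_sched_entity_is_task(curr)) {
		struct task_struct *p = bpf_sched_entity_to_task(curr);

		/* < 0 suppresses the tick preemption, extending the slice */
		if (p && bpf_sched_task_tag_of(p) == TAG_ONLINE)
			return -1;
	}
	return 0;
}

char LICENSE[] SEC("license") = "GPL";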
Signed-off-by: Roman Gushchin guro@fb.com Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched_hook_defs.h | 5 ++++- kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 14344004e335..e2f65e4b8895 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -1,2 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -BPF_SCHED_HOOK(int, 0, dummy, void) +BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec) +BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) +BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, + struct sched_entity *se) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc58182b201f..707b830d19cf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -168,6 +168,7 @@ int __weak arch_asym_cpu_priority(int cpu) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif +#include <linux/bpf_sched.h>
#ifdef CONFIG_QOS_SCHED
@@ -5025,6 +5026,21 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_check_preempt_tick(curr, delta_exec); + + if (ret < 0) + return; + else if (ret > 0) { + resched_curr(rq_of(cfs_rq)); + clear_buddies(cfs_rq, curr); + return; + } + } +#endif + if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); /* @@ -7995,6 +8011,15 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) { s64 gran, vdiff = curr->vruntime - se->vruntime;
+#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_wakeup_preempt_entity(curr, se); + + if (ret) + return ret; + } +#endif + if (vdiff <= 0) return -1;
@@ -8080,6 +8105,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ likely(!task_has_idle_policy(p))) goto preempt;
+#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_check_preempt_wakeup(current, p); + + if (ret < 0) + return; + else if (ret > 0) + goto preempt; + } +#endif + /* * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick):
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
This hook point can change the position of a sched entity on the red-black tree. For example, in cloud scenarios there are online tasks that must respond promptly and offline tasks that need not, and this hook gives users a way to decide which class of tasks runs first. The information that pick-next-task relies on comes from system state such as the red-black tree, and modifying the CFS data itself would affect the whole system. Therefore the hook function is added here: only the position of the task on the red-black tree is changed, and the vruntime value is not modified.
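A minimal sketch of how this hook might be used (the section name and prog name are assumptions; returning 1 only reorders entities, as described above, and anything else keeps the vruntime order):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_tag_pick_next_entity")
int BPF_PROG(tag_pick_next, struct sched_entity *a, struct sched_entity *b)
{
	long tag_a = 0, tag_b = 0;

	if (bpf_sched_entity_is_task(a)) {
		struct task_struct *p = bpf_sched_entity_to_task(a);

		if (p)
			tag_a = bpf_sched_task_tag_of(p);
	}
	if (bpf_sched_entity_is_task(b)) {
		struct task_struct *p = bpf_sched_entity_to_task(b);

		if (p)
			tag_b = bpf_sched_task_tag_of(p);
	}

	/* 1: treat 'a' as running before 'b'; 0: keep the default order */
	return tag_a > tag_b ? 1 : 0;
}

char LICENSE[] SEC("license") = "GPL";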
Signed-off-by: Guan Jing guanjing6@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched_hook_defs.h | 2 ++ kernel/sched/fair.c | 9 +++++++++ 2 files changed, 11 insertions(+)
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index e2f65e4b8895..834e327e481b 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -3,3 +3,5 @@ BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsign BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, struct sched_entity *se) +BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, const struct sched_entity *curr, + const struct sched_entity *next) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 707b830d19cf..b0b5e6f58adc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -620,6 +620,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline bool entity_before(const struct sched_entity *a, const struct sched_entity *b) { +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_tag_pick_next_entity(a, b); + + if (ret == 1) + return 1; + } +#endif + return (s64)(a->vruntime - b->vruntime) < 0; }
From: Ren Zhijie renzhijie2@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add 2 hooks to update rq and task state in enqueue_task_fair() and dequeue_task_fair(): cfs_enqueue_task and cfs_dequeue_task.
When a task enters or leaves the runqueue, these hooks let a bpf prog read and update the context of interest after the runqueue has completed its state change.
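For instance, a prog could keep a per-cpu count of runnable tagged tasks (a sketch; the map, prog names and section prefix are assumptions, and the hooks' return values are ignored by the kernel since the hook type is void):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, long);
	__uint(max_entries, 1);
} nr_tagged SEC(".maps");

static __always_inline void account(struct task_struct *p, long delta)
{
	u32 key = 0;
	long *cnt;

	if (bpf_sched_task_tag_of(p) <= 0)
		return;

	cnt = bpf_map_lookup_elem(&nr_tagged, &key);
	if (cnt)
		__sync_fetch_and_add(cnt, delta);
}

SEC("sched/cfs_enqueue_task")
int BPF_PROG(on_enqueue, struct rq *rq, struct task_struct *p)
{
	account(p, 1);	/* one more tagged task runnable on this cpu */
	return 0;
}

SEC("sched/cfs_dequeue_task")
int BPF_PROG(on_dequeue, struct rq *rq, struct task_struct *p)
{
	account(p, -1);	/* one fewer tagged task runnable on this cpu */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";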
Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched_hook_defs.h | 2 ++ kernel/sched/fair.c | 10 ++++++++++ 2 files changed, 12 insertions(+)
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 834e327e481b..9350f2b7148e 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -5,3 +5,5 @@ BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, struct sched_entity *se) BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, const struct sched_entity *curr, const struct sched_entity *next) +BPF_SCHED_HOOK(void, (void) 0, cfs_enqueue_task, struct rq *rq, struct task_struct *p) +BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b0b5e6f58adc..3e80b31c2f58 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6433,6 +6433,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) assert_list_leaf_cfs_rq(rq);
hrtick_update(rq); + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) + bpf_sched_cfs_enqueue_task(rq, p); +#endif }
static void set_next_buddy(struct sched_entity *se); @@ -6510,6 +6515,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) + bpf_sched_cfs_dequeue_task(rq, p); +#endif }
#ifdef CONFIG_SMP
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add three hooks of sched type in select_task_rq_fair(), as follows: 'cfs_select_rq' Replaces the original core-selection policy or implements dynamic CPU affinity.
'cfs_select_rq_exit' Restores the CPU affinity of the task before exiting 'select_task_rq_fair'.
To be used with the 'cfs_select_rq' hook to implement dynamic CPU affinity.
'cfs_wake_affine' Determines on which CPU the task can run soonest. Allows the user to implement different policies.
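A hedged sketch of the first and third hooks (prog names, section prefix and policy are assumptions; return values follow the fair.c changes below: for cfs_select_rq a value >= 0 is used as the target CPU directly and -1 falls back to the default path, while cfs_wake_affine values outside the valid CPU range keep the default behaviour):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_select_rq")
int BPF_PROG(select_rq_demo, struct sched_migrate_ctx *c)
{
	struct cpumask_op_args op = {};
	struct task_struct *p = c->task;
	int prev = c->prev_cpu;

	/* untagged tasks: -1 keeps the kernel's default CPU selection */
	if (!p || bpf_sched_task_tag_of(p) <= 0)
		return -1;

	/* keep a tagged task on its previous CPU while it is still allowed there */
	op.op_type = CPUMASK_TEST_CPU;
	op.arg1 = &prev;
	op.arg2 = (void *)p->cpus_ptr;
	if (bpf_cpumask_op(&op, sizeof(op)) > 0)
		return prev;	/* >= 0: use this CPU directly */

	return -1;
}

SEC("sched/cfs_wake_affine")
int BPF_PROG(wake_affine_demo, struct sched_affine_ctx *c)
{
	/* prefer the previous CPU; an out-of-range value keeps the default */
	return c->prev_cpu;
}

char LICENSE[] SEC("license") = "GPL";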
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched.h | 18 +++++++++ include/linux/sched_hook_defs.h | 3 ++ kernel/sched/core.c | 11 ++++++ kernel/sched/fair.c | 66 +++++++++++++++++++++++++++++++++ scripts/bpf_doc.py | 4 ++ 5 files changed, 102 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index b5b45c32a6b9..73ddff132493 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2541,6 +2541,24 @@ enum cpumask_op_type { CPUMASK_CPULIST_PARSE };
+struct sched_migrate_ctx { + struct task_struct *task; + struct cpumask *select_idle_mask; + int prev_cpu; + int curr_cpu; + int is_sync; + int want_affine; + int wake_flags; + int sd_flag; + int new_cpu; +}; + +struct sched_affine_ctx { + struct task_struct *task; + int prev_cpu; + int curr_cpu; + int is_sync; +}; #endif
#endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 9350f2b7148e..f8d1381a1a9a 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -7,3 +7,6 @@ BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, const struct sched_entity *curr const struct sched_entity *next) BPF_SCHED_HOOK(void, (void) 0, cfs_enqueue_task, struct rq *rq, struct task_struct *p) BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_struct *p) +BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 792dc70feac9..652c06bd546d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9887,6 +9887,10 @@ LIST_HEAD(task_groups); static struct kmem_cache *task_group_cache __read_mostly; #endif
+#ifdef CONFIG_BPF_SCHED +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#endif + void __init sched_init(void) { unsigned long ptr = 0; @@ -9931,6 +9935,13 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ }
+#if defined(CONFIG_CPUMASK_OFFSTACK) && defined(CONFIG_BPF_SCHED) + for_each_possible_cpu(i) { + per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); + } +#endif + init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e80b31c2f58..d9af04551788 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -77,6 +77,10 @@ unsigned int sysctl_sched_latency = 6000000ULL; static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+#ifdef CONFIG_BPF_SCHED +DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +#endif + /* * The initial- and re-scaling of tunables is configurable * @@ -6739,6 +6743,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, { int target = nr_cpumask_bits;
+#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + struct sched_affine_ctx ctx; + int ret; + + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = this_cpu; + ctx.is_sync = sync; + + ret = bpf_sched_cfs_wake_affine(&ctx); + if (ret >= 0 && ret < nr_cpumask_bits) + return ret; + } +#endif + if (sched_feat(WA_IDLE)) target = wake_affine_idle(this_cpu, prev_cpu, sync);
@@ -7857,6 +7877,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) int idlest_cpu = 0; #endif
+#ifdef CONFIG_BPF_SCHED + struct sched_migrate_ctx ctx; + cpumask_t *cpus_prev = NULL; + cpumask_t *cpus; + int ret; +#endif + /* * required for stable ->cpus_allowed */ @@ -7884,6 +7911,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) }
rcu_read_lock(); +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = cpu; + ctx.is_sync = sync; + ctx.wake_flags = wake_flags; + ctx.want_affine = want_affine; + ctx.sd_flag = sd_flag; + ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask); + + ret = bpf_sched_cfs_select_rq(&ctx); + if (ret >= 0) { + rcu_read_unlock(); + return ret; + } else if (ret != -1) { + cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + if (cpumask_subset(cpus, p->cpus_ptr) && + !cpumask_empty(cpus)) { + cpus_prev = (void *)p->cpus_ptr; + p->cpus_ptr = cpus; + } + } + } +#endif + for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, @@ -7922,6 +7975,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.new_cpu = new_cpu; + ret = bpf_sched_cfs_select_rq_exit(&ctx); + if (ret >= 0) + new_cpu = ret; + + if (cpus_prev) + p->cpus_ptr = cpus_prev; + } +#endif + rcu_read_unlock();
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 7f5952a3be61..d17415b53a88 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -707,6 +707,8 @@ class PrinterHelpers(Printer): 'struct sched_entity', 'struct cpumask', 'struct cpumask_op_args', + 'struct sched_migrate_ctx', + 'struct sched_affine_ctx', ] known_types = { '...', @@ -769,6 +771,8 @@ class PrinterHelpers(Printer): 'struct sched_entity', 'struct cpumask', 'struct cpumask_op_args', + 'struct sched_migrate_ctx', + 'struct sched_affine_ctx', } mapped_types = { 'u8': '__u8',
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7X7WW
--------------------------------
Add a lib for sched programmable; these functions help users write bpf scheduler programs more easily.
The main purposes are as follows: 1. Wrap the helper functions to make them easier to use. 2. Implement some generic methods and policies for the scheduler.
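For instance, a prog could use the wrappers to pick the least loaded of the first few CPUs. This is purely illustrative: the bound of 8 CPUs, the attach point and ignoring the task's affinity are all simplifications, and vmlinux.h is assumed to provide the kernel type definitions that libbpf_sched.h relies on.

#include "vmlinux.h"
#include <bpf/libbpf_sched.h>

SEC("sched/cfs_select_rq")
int BPF_PROG(pick_idlest, struct sched_migrate_ctx *c)
{
	unsigned int best_nr = (unsigned int)-1;
	int nr = libbpf_nr_cpus_ids();
	int cpu, best = -1;

	/* bounded loop for the verifier; a real policy would walk the task's cpus_ptr */
	for (cpu = 0; cpu < nr && cpu < 8; cpu++) {
		unsigned int h_nr = libbpf_cfs_h_nr_running_of(cpu);

		if (h_nr < best_nr) {
			best_nr = h_nr;
			best = cpu;
		}
	}

	return best;	/* -1 falls back to the kernel's default selection */
}

char LICENSE[] SEC("license") = "GPL";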
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Guan Jing guanjing6@huawei.com --- tools/lib/bpf/Makefile | 2 +- tools/lib/bpf/libbpf_sched.h | 507 +++++++++++++++++++++++++++++++++++ 2 files changed, 508 insertions(+), 1 deletion(-) create mode 100644 tools/lib/bpf/libbpf_sched.h
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index cf7f02c67968..4ec1e06c05bb 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -239,7 +239,7 @@ install_lib: all_cmd
SRC_HDRS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h \ bpf_helpers.h bpf_tracing.h bpf_endian.h bpf_core_read.h \ - skel_internal.h libbpf_version.h usdt.bpf.h + skel_internal.h libbpf_version.h usdt.bpf.h libbpf_sched.h GEN_HDRS := $(BPF_GENERATED)
INSTALL_PFX := $(DESTDIR)$(prefix)/include/bpf diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h new file mode 100644 index 000000000000..3e6f34718dbc --- /dev/null +++ b/tools/lib/bpf/libbpf_sched.h @@ -0,0 +1,507 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef __LIBBPF_LIBSCHED_H +#define __LIBBPF_LIBSCHED_H + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* set bigger value may lead verifier failed */ +#define BPF_SCHED_LOOP_MAX 1024 +#define INVALID_PTR ((void *)(0UL)) +#define getVal(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask); +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap); +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2); +static __always_inline int libbpf_nr_cpus_ids(void); +static __always_inline int libbpf_nr_cpumask_bits(void); + +#if NR_CPUS == 1 + +#define libbpf_for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) + +#else + +#define libbpf_for_each_cpu(cpu, mask) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next((cpu), (mask)), \ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for (int __i = 0, (cpu) = libbpf_cpumask_next_wrap((start) - 1,\ + (mask), (start), false); \ + (cpu) < libbpf_nr_cpumask_bits() && __i < NR_CPUS; \ + (cpu) = libbpf_cpumask_next_wrap((cpu), (mask), (start),\ + true), __i++) + +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next_and((cpu), (mask1), (mask2)),\ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#endif + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct bpf_cpumask_info); + __uint(max_entries, 1); +} map_cpumask_info SEC(".maps"); + +static __always_inline long libbpf_cpumask_copy(struct cpumask *dst, + struct cpumask *src) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_COPY; + op.arg1 = dst; + op.arg2 = src; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_empty(struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EMPTY; + op.arg1 = mask; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_and(struct cpumask *dst, + struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = 
CPUMASK_AND; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_andnot(struct cpumask *dst, + struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_ANDNOT; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_subset(struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_SUBSET; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_equal(struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EQUAL; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_weight(struct cpumask *src1) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_WEIGHT; + op.arg1 = src1; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_test_cpu(int cpu, + struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_TEST_CPU; + op.arg1 = &cpu; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_WRAP; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = &start; + op.arg4 = &wrap; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_AND; + op.arg1 = &n; + op.arg2 = mask1; + op.arg3 = mask2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_cpulist_parse(char *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_CPULIST_PARSE; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline int libbpf_num_active_cpus(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nums_active_cpus); +} + +static __always_inline int libbpf_num_possible_cpus(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nums_possible_cpus); +} + +static __always_inline void libbpf_possible_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + 
return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_possible_cpumask); +} + +static __always_inline void libbpf_active_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_active_cpumask); +} + +static __always_inline void libbpf_isolate_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + libbpf_cpumask_copy(mask, &cpus->cpu_isolate_cpumask); +} + +static __always_inline int libbpf_nr_cpus_ids(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->nr_cpu_ids); +} + +static __always_inline int libbpf_nr_cpumask_bits(void) +{ + struct bpf_cpumask_info *cpus; + int key = 0; + + cpus = bpf_map_lookup_elem(&map_cpumask_info, &key); + if (!cpus) + return -1; + + bpf_get_cpumask_info(&map_cpumask_info, cpus); + return getVal(cpus->bpf_nr_cpumask_bits); +} + +static __always_inline unsigned long libbpf_cfs_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_load_avg); +} + +static __always_inline unsigned long libbpf_cfs_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_runnable_avg); +} + +static __always_inline unsigned long libbpf_cfs_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_util_avg); +} + +static __always_inline unsigned long libbpf_rt_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_load_avg; +} + +static __always_inline unsigned long libbpf_rt_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_runnable_avg; +} + +static __always_inline unsigned long libbpf_rt_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.rt_util_avg; +} + +static __always_inline unsigned long libbpf_irq_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.irq_load_avg; +} + +static __always_inline unsigned long libbpf_irq_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_stats load; + + bpf_sched_cpu_stats_of(cpu, &load, sizeof(load)); + return load.irq_util_avg; +} + +static __always_inline unsigned int libbpf_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.nr_running); +} + +static __always_inline unsigned int libbpf_cfs_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return 
getVal(running.cfs_h_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_idle_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return running.cfs_idle_h_nr_running; +} + +static __always_inline unsigned int libbpf_rt_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return getVal(running.rt_nr_running); +} + +static __always_inline unsigned int libbpf_rr_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_stats running; + + bpf_sched_cpu_stats_of(cpu, &running, sizeof(running)); + return running.rr_nr_running; +} + +static __always_inline unsigned int libbpf_exit_latency_of(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat)); + return stat.exit_latency; +} + +static __always_inline unsigned long libbpf_idle_stamp_of(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat)); + return stat.idle_stamp; +} + +static __always_inline unsigned long libbpf_avg_idle_of(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat)); + return stat.avg_idle; +} + +static __always_inline unsigned long libbpf_available_idle_cpu(int cpu) +{ + struct bpf_sched_cpu_stats stat; + + bpf_sched_cpu_stats_of(cpu, &stat, sizeof(stat)); + return getVal(stat.available_idle); +} + +static __always_inline unsigned long libbpf_capacity_of(int cpu) +{ + struct bpf_sched_cpu_stats cap; + + bpf_sched_cpu_stats_of(cpu, &cap, sizeof(cap)); + return getVal(cap.capacity); +} + +static __always_inline unsigned long libbpf_capacity_orig_of(int cpu) +{ + struct bpf_sched_cpu_stats cap; + + bpf_sched_cpu_stats_of(cpu, &cap, sizeof(cap)); + return cap.capacity_orig; +} + +static __always_inline int libbpf_cpus_share_cache(int src_cpu, int dst_cpu) +{ + return bpf_cpus_share_cache(src_cpu, dst_cpu); +} + +static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se) +{ + int se_tag = 0; + + if (bpf_sched_entity_is_task(se)) { + struct task_struct *task = bpf_sched_entity_to_task(se); + + se_tag = bpf_sched_task_tag_of(task); + } else { + struct task_group *tg = bpf_sched_entity_to_tg(se); + + se_tag = bpf_sched_tg_tag_of(tg); + } + + return se_tag; +} +#endif
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/2013 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/2...