sched: programmable: Fix sched bpf hook cfs_check_preempt_tick
Chen Hui (11):
  sched: programmable: Add a tag for the task group
  sched: programmable: Add user interface of task group tag
  sched: programmable: Add a tag for the task
  sched: programmable: Add user interface of task tag
  sched: programmable: add bpf_sched_task_tag_of helper function
  sched: programmable: Add convenient helper functions to convert sched entity
  samples/bpf: Update bpf loader for sched section names
  bpf: sched: Add four helper functions to get cpu stat
  bpf:programmable: Add cpumask ops collection
  sched: programmable: Add lib for sched programmable
  sched: programmable: Add three hooks in select_task_rq_fair()

Guan Jing (2):
  sched: programmable: Add hook for pick next task
  samples: bpf: Add bpf sched pick task sample

Hui Tang (3):
  bpf:programmable: Add helper func to check cpu share cache
  bpf:programmable: Add helper func to set cpus_ptr in task
  samples:bpf: Add samples for cfs select core

Ren Zhijie (5):
  sched: programmable: Fix sched bpf hook cfs_check_preempt_tick
  sched: programmable: add bpf_sched_tg_tag_of helper function
  samples: bpf: Add bpf sched preempt sample program
  sched: programmable: Add helpers to set tag of task or task_group
  sched: programmable: Add helper function for cpu topology.
 fs/proc/base.c                       |  65 ++++
 include/linux/bpf_topology.h         |  46 +++
 include/linux/sched.h                |  85 +++++
 include/linux/sched_hook_defs.h      |   5 +
 include/uapi/linux/bpf.h             | 151 +++++++++
 init/init_task.c                     |   3 +
 kernel/bpf/helpers.c                 |  12 +
 kernel/bpf/verifier.c                |   4 +-
 kernel/sched/Makefile                |   3 +-
 kernel/sched/bpf_sched.c             | 433 ++++++++++++++++++++++++
 kernel/sched/bpf_topology.c          |  97 ++++++
 kernel/sched/core.c                  | 103 ++++++
 kernel/sched/fair.c                  |  64 +++-
 kernel/sched/sched.h                 |   8 +
 samples/bpf/Makefile                 |   9 +
 samples/bpf/bpf_load.c               |   8 +-
 samples/bpf/sched_pick_task_kern.c   |  56 ++++
 samples/bpf/sched_pick_task_user.c   |  94 ++++++
 samples/bpf/sched_preempt_kern.c     | 147 +++++++++
 samples/bpf/sched_preempt_user.c     | 140 ++++++++
 samples/bpf/sched_select_core_kern.c | 259 +++++++++++++++
 samples/bpf/sched_select_core_user.c | 125 +++++++
 scripts/bpf_helpers_doc.py           |  20 ++
 tools/include/uapi/linux/bpf.h       | 151 +++++++++
 tools/lib/bpf/libbpf_sched.h         | 473 +++++++++++++++++++++++++++
 25 files changed, 2555 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/bpf_topology.h
 create mode 100644 kernel/sched/bpf_topology.c
 create mode 100644 samples/bpf/sched_pick_task_kern.c
 create mode 100644 samples/bpf/sched_pick_task_user.c
 create mode 100644 samples/bpf/sched_preempt_kern.c
 create mode 100644 samples/bpf/sched_preempt_user.c
 create mode 100644 samples/bpf/sched_select_core_kern.c
 create mode 100644 samples/bpf/sched_select_core_user.c
 create mode 100644 tools/lib/bpf/libbpf_sched.h
From: Ren Zhijie renzhijie2@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Make the hook behave like the existing branch where the current sched entity's delta_exec is bigger than ideal_runtime: when the bpf prog requests a resched, also clear the buddies of the current entity and return.
Fixes: 05661011fcac ("sched: cfs: add bpf hooks to control wakeup and tick preemption") Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3395d102b43e..79e36870b206 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4482,8 +4482,11 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 
 		if (ret < 0)
 			return;
-		else if (ret > 0)
+		else if (ret > 0) {
 			resched_curr(rq_of(cfs_rq));
+			clear_buddies(cfs_rq, curr);
+			return;
+		}
 	}
 #endif
 
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a tag for the task group, to support the tag-based scheduling mechanism.
The tag is used to identify a special task or a type of special tasks. There are many such tasks in the real world, for example foreground and background tasks, or online and offline tasks, etc. With the tag we can identify these special tasks and apply specific policies to them.
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- kernel/sched/core.c | 19 +++++++++++++++++++ kernel/sched/sched.h | 5 +++++ 2 files changed, 24 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51c707897c8d..c53c032a378a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7755,6 +7755,13 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); }
+#ifdef CONFIG_BPF_SCHED +static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg) +{ + tg->tag = ptg->tag; +} +#endif + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -7775,6 +7782,10 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err;
+#ifdef CONFIG_BPF_SCHED + tg_init_tag(tg, parent); +#endif + alloc_uclamp_sched_group(tg, parent);
return tg; @@ -7846,6 +7857,14 @@ static void sched_change_group(struct task_struct *tsk, int type) sched_change_qos_group(tsk, tg); #endif
+#ifdef CONFIG_BPF_SCHED + /* + * This function has cleared and restored the task status, + * so we do not need to dequeue and enqueue the task again. + */ + tsk->tag = tg->tag; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk, type); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 42d5fb7d9464..d44d2ee8799b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -455,7 +455,12 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; #endif
+#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a group */ + long tag; +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4)
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a user interface for the task group tag, which bridges the information gap between user mode and kernel mode. The tag is exposed as the 'cpu.tag' file of the cpu cgroup controller.
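As an illustrative sketch (not part of this patch), user space could tag a whole group through that file; the cgroup mount point and group name below are assumptions:

#include <stdio.h>
#include <stdlib.h>

/* Assumed cgroup v1 mount point and group name; adjust for the local setup. */
#define TAG_FILE "/sys/fs/cgroup/cpu/online_grp/cpu.tag"

int main(void)
{
	FILE *fp = fopen(TAG_FILE, "w");

	if (!fp) {
		perror("fopen");
		return EXIT_FAILURE;
	}

	/* Tag every task in this group and its descendants with 1. */
	fprintf(fp, "1\n");
	fclose(fp);
	return EXIT_SUCCESS;
}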
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/linux/sched.h | 4 +++ kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 3 ++ 3 files changed, 88 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index c0aa1ea09ac6..b306cf82e597 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2177,4 +2177,8 @@ static inline int sched_qos_cpu_overload(void) return 0; } #endif + +#ifdef CONFIG_BPF_SCHED +extern void sched_settag(struct task_struct *tsk, s64 tag); +#endif #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c53c032a378a..c07440743059 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8637,6 +8637,80 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_BPF_SCHED +void sched_settag(struct task_struct *tsk, s64 tag) +{ + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq_flags rf; + struct rq *rq; + + if (tsk->tag == tag) + return; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + update_rq_clock(rq); + if (queued) + dequeue_task(rq, tsk, queue_flags); + if (running) + put_prev_task(rq, tsk); + + tsk->tag = tag; + + if (queued) + enqueue_task(rq, tsk, queue_flags); + if (running) + set_next_task(rq, tsk); + + task_rq_unlock(rq, tsk, &rf); +} + +int tg_change_tag(struct task_group *tg, void *data) +{ + struct css_task_iter it; + struct task_struct *tsk; + s64 tag = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->tag = tag; + + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_settag(tsk, tag); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_tag_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 tag) +{ + struct task_group *tg = css_tg(css); + + if (tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +} + +static inline s64 cpu_tag_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->tag; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -8698,6 +8772,13 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_qos_read, .write_s64 = cpu_qos_write, }, +#endif +#ifdef CONFIG_BPF_SCHED + { + .name = "tag", + .read_s64 = cpu_tag_read, + .write_s64 = cpu_tag_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d44d2ee8799b..1aeccc5111d7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -498,6 +498,9 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) }
extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_BPF_SCHED +extern int tg_change_tag(struct task_group *tg, void *data); +#endif
extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a tag for the task, used to identify special tasks. Users can mark different tags for specific workloads through the file system interface, and kernel subsystems can mark them with the set_* helpers as well. A bpf prog reads the tags to distinguish different workloads.
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/linux/sched.h | 5 +++++ init/init_task.c | 3 +++ kernel/sched/core.c | 3 +++ 3 files changed, 11 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index b306cf82e597..d6db602da068 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1397,7 +1397,12 @@ struct task_struct { */ randomized_struct_fields_end
+#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a task */ + long tag; +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/init/init_task.c b/init/init_task.c index 5fa18ed59d33..7003426df677 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -213,6 +213,9 @@ struct task_struct init_task #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_BPF_SCHED + .tag = 0, +#endif }; EXPORT_SYMBOL(init_task);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c07440743059..c0fd31446c70 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3113,6 +3113,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif +#ifdef CONFIG_BPF_SCHED + p->tag = 0; +#endif }
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a user interface for the task tag, which bridges the information gap between user mode and kernel mode.
The tag is exposed at: /proc/${pid}/task/${pid}/tag
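As an illustrative sketch (not part of this patch), a user-space program could read and write the tag through that file; the pid/tid 1234 below is a made-up example:

#include <stdio.h>

int main(void)
{
	/* Example thread; in practice build the path from a real pid/tid. */
	const char *path = "/proc/1234/task/1234/tag";
	long tag = 0;
	FILE *fp;

	fp = fopen(path, "w");
	if (!fp)
		return 1;
	fprintf(fp, "1\n");		/* mark the thread as a special task */
	fclose(fp);

	fp = fopen(path, "r");
	if (!fp)
		return 1;
	if (fscanf(fp, "%ld", &tag) == 1)
		printf("tag = %ld\n", tag);
	fclose(fp);
	return 0;
}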
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- fs/proc/base.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c index b9052be86e8d..8ae7c2be70c2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3644,6 +3644,68 @@ static const struct inode_operations proc_tid_comm_inode_operations = { .permission = proc_tid_comm_permission, };
+#ifdef CONFIG_BPF_SCHED +static ssize_t pid_tag_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *tsk; + char buffer[PROC_NUMBUF]; + int err = 0, tag = 0; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &tag); + if (err) + goto out; + + sched_settag(tsk, tag); + +out: + put_task_struct(tsk); + return err < 0 ? err : count; +} + +static int pid_tag_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *tsk; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + seq_printf(m, "%ld\n", tsk->tag); + put_task_struct(tsk); + + return 0; +} + +static int pid_tag_open(struct inode *inode, struct file *flip) +{ + return single_open(flip, pid_tag_show, inode); +} + +static const struct file_operations proc_pid_tag_operations = { + .open = pid_tag_open, + .read = seq_read, + .write = pid_tag_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * Tasks */ @@ -3751,6 +3813,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_ASCEND_SHARE_POOL ONE("sp_group", 0444, proc_sp_group_state), #endif +#ifdef CONFIG_BPF_SCHED + REG("tag", 0644, proc_pid_tag_operations), +#endif };
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
From: Ren Zhijie renzhijie2@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
This helper function reads the tag of a task group. A bpf prog can use the tag to distinguish different workloads.
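As an illustrative sketch (not part of this patch), a sched bpf prog could combine the helper with the cfs_check_preempt_wakeup hook from this series; reading task_struct::sched_task_group directly assumes CONFIG_CGROUP_SCHED and BTF-based pointer access, and the preemption policy is made up:

#include <linux/version.h>
#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_check_preempt_wakeup")
int BPF_PROG(prefer_tagged_group, struct task_struct *curr, struct task_struct *p)
{
	long curr_tag = bpf_sched_tg_tag_of(curr->sched_task_group);
	long p_tag = bpf_sched_tg_tag_of(p->sched_task_group);

	/* Let tasks of a higher-tagged group preempt, otherwise keep the
	 * kernel's default decision (return 0).
	 */
	if (p_tag > curr_tag)
		return 1;
	return 0;
}

char _license[] SEC("license") = "GPL";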
Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Chen Hui judy.chenhui@huawei.com --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/helpers.c | 3 +++ kernel/bpf/verifier.c | 4 ++-- kernel/sched/bpf_sched.c | 23 +++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 6 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d5fbbc28b6a0..c9b13364135d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3777,6 +3777,14 @@ union bpf_attr { * to be enabled. * Return * 1 if the sched entity belongs to a cgroup, 0 otherwise. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3940,6 +3948,7 @@ union bpf_attr { FN(sched_entity_to_tgidpid), \ FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ + FN(sched_tg_tag_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4bb5921a7d21..0b3bd94ec195 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -658,6 +658,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak;
const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -697,6 +698,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_sched_tg_tag_of: + return &bpf_sched_tg_tag_of_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d26104b258ba..7acc2cd0081f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5020,10 +5020,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false;
- if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; }
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 2ce2afcacb17..2eedbf84f66f 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -112,6 +112,29 @@ static const struct bpf_func_proto bpf_sched_entity_belongs_to_cgrp_proto = { .arg2_type = ARG_ANYTHING, };
+BPF_CALL_1(bpf_sched_tg_tag_of, struct task_group *, tg) +{ + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + if (tg == NULL) + return -EINVAL; + ret = tg->tag; +#endif + + return ret; +} + +BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group) + +const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { + .func = bpf_sched_tg_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index be21512ee7be..f8a778ac9ce1 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -436,6 +436,7 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct sched_entity', + 'struct task_group', ] known_types = { '...', @@ -480,6 +481,7 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct sched_entity', + 'struct task_group', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b2a0b189b797..1d1759888c29 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3777,6 +3777,14 @@ union bpf_attr { * to be enabled. * Return * 1 if the sched entity belongs to a cgroup, 0 otherwise. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3940,6 +3948,7 @@ union bpf_attr { FN(sched_entity_to_tgidpid), \ FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ + FN(sched_tg_tag_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
This helper function reads the tag of a task (struct task_struct). A bpf prog can use the tag to distinguish different workloads.
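As an illustrative sketch (not part of this patch), the helper can drive the cfs_check_preempt_wakeup hook added earlier in this series; the tag values mirror the sample program added later in the series, and the policy is an assumption:

#include <linux/version.h>
#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define TAG_ONLINE	1
#define TAG_OFFLINE	(-1)

SEC("sched/cfs_check_preempt_wakeup")
int BPF_PROG(no_offline_preempt, struct task_struct *curr, struct task_struct *p)
{
	/* An offline task never preempts an online one; 0 keeps the default. */
	if (bpf_sched_task_tag_of(curr) == TAG_ONLINE &&
	    bpf_sched_task_tag_of(p) == TAG_OFFLINE)
		return -1;
	return 0;
}

char _license[] SEC("license") = "GPL";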
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/helpers.c | 3 +++ kernel/sched/bpf_sched.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 36 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c9b13364135d..544a730d6763 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3785,6 +3785,13 @@ union bpf_attr { * Return * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or * a negative error in case of failure. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*.The bpf prog obtains the tags to detect + * different workloads. + * Return + * Task tag, if used, 0 as default tag, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3949,6 +3956,7 @@ union bpf_attr { FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ FN(sched_tg_tag_of), \ + FN(sched_task_tag_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0b3bd94ec195..ef2d8cb87807 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -659,6 +659,7 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_task_tag_of_proto __weak;
const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -700,6 +701,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_query_proto; case BPF_FUNC_sched_tg_tag_of: return &bpf_sched_tg_tag_of_proto; + case BPF_FUNC_sched_task_tag_of: + return &bpf_sched_task_tag_of_proto; default: break; } diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 2eedbf84f66f..4e98ccbd1d97 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -135,6 +135,23 @@ const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { .arg1_btf_id = &btf_sched_tg_ids[0], };
+BPF_CALL_1(bpf_sched_task_tag_of, struct task_struct *, tsk) +{ + if (tsk == NULL) + return -EINVAL; + return tsk->tag; +} + +BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct) + +const struct bpf_func_proto bpf_sched_task_tag_of_proto = { + .func = bpf_sched_task_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1d1759888c29..b563b8e124a7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3785,6 +3785,13 @@ union bpf_attr { * Return * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or * a negative error in case of failure. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*.The bpf prog obtains the tags to detect + * different workloads. + * Return + * Task tag, if used, 0 as default tag, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3949,6 +3956,7 @@ union bpf_attr { FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ FN(sched_tg_tag_of), \ + FN(sched_task_tag_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add three helper functions:
1) bpf_sched_entity_is_task checks whether the sched entity is a task.
2) bpf_sched_entity_to_task converts the sched entity to a task_struct pointer.
3) bpf_sched_entity_to_tg converts the sched entity to a task_group pointer.
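As an illustrative sketch (not part of this patch), the helpers are meant to be used together in entity-based hooks such as cfs_check_preempt_tick; a fuller version appears later in this series as samples/bpf/sched_preempt_kern.c, and the toy policy below is an assumption:

#include <linux/version.h>
#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_check_preempt_tick")
int BPF_PROG(tag_of_entity, struct sched_entity *curr, unsigned long delta_exec)
{
	long tag = 0;

	if (bpf_sched_entity_is_task(curr)) {
		struct task_struct *tsk = bpf_sched_entity_to_task(curr);

		if (tsk)
			tag = bpf_sched_task_tag_of(tsk);
	} else {
		struct task_group *tg = bpf_sched_entity_to_tg(curr);

		if (tg)
			tag = bpf_sched_tg_tag_of(tg);
	}

	/* Toy policy: tagged entities are preempted on every tick check,
	 * untagged entities keep the default behaviour.
	 */
	return tag > 0 ? 1 : 0;
}

char _license[] SEC("license") = "GPL";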
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/uapi/linux/bpf.h | 21 ++++++++++++ kernel/sched/bpf_sched.c | 61 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 21 ++++++++++++ 3 files changed, 103 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 544a730d6763..3b24912f2f7f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3792,6 +3792,24 @@ union bpf_attr { * different workloads. * Return * Task tag, if used, 0 as default tag, or a negative error in case of failure. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3957,6 +3975,9 @@ union bpf_attr { FN(sched_entity_belongs_to_cgrp), \ FN(sched_tg_tag_of), \ FN(sched_task_tag_of), \ + FN(sched_entity_is_task), \ + FN(sched_entity_to_task), \ + FN(sched_entity_to_tg), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 4e98ccbd1d97..295f3ed02c27 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -152,6 +152,61 @@ const struct bpf_func_proto bpf_sched_task_tag_of_proto = { .arg1_btf_id = &btf_sched_task_ids[0], };
+BPF_CALL_1(bpf_sched_entity_is_task, struct sched_entity *, se) +{ + return entity_is_task(se) ? 1 : 0; +} + +static const struct bpf_func_proto bpf_sched_entity_is_task_proto = { + .func = bpf_sched_entity_is_task, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_task, struct sched_entity *, se) +{ + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + return (unsigned long)tsk; + } + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_task_proto = { + .func = bpf_sched_entity_to_task, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_task_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_tg, struct sched_entity *, se) +{ +#if CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) { + struct task_group *tg = group_cfs_rq(se)->tg; + + return (unsigned long)tg; + } +#endif + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = { + .func = bpf_sched_entity_to_tg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_tg_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -164,6 +219,12 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_entity_to_cgrpid_proto; case BPF_FUNC_sched_entity_belongs_to_cgrp: return &bpf_sched_entity_belongs_to_cgrp_proto; + case BPF_FUNC_sched_entity_is_task: + return &bpf_sched_entity_is_task_proto; + case BPF_FUNC_sched_entity_to_task: + return &bpf_sched_entity_to_task_proto; + case BPF_FUNC_sched_entity_to_tg: + return &bpf_sched_entity_to_tg_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b563b8e124a7..9bf63928487d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3792,6 +3792,24 @@ union bpf_attr { * different workloads. * Return * Task tag, if used, 0 as default tag, or a negative error in case of failure. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3957,6 +3975,9 @@ union bpf_attr { FN(sched_entity_belongs_to_cgrp), \ FN(sched_tg_tag_of), \ FN(sched_task_tag_of), \ + FN(sched_entity_is_task), \ + FN(sched_entity_to_task), \ + FN(sched_entity_to_tg), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add support for section names starting with sched/.
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com --- samples/bpf/bpf_load.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index c5ad528f046e..4dfb45d254b1 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -87,6 +87,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_sockops = strncmp(event, "sockops", 7) == 0; bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; + bool is_sched = strncmp(event, "sched/", 6) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -120,6 +121,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SK_SKB; } else if (is_sk_msg) { prog_type = BPF_PROG_TYPE_SK_MSG; + } else if (is_sched) { + prog_type = BPF_PROG_TYPE_SCHED; } else { printf("Unknown event '%s'\n", event); return -1; @@ -137,7 +140,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd;
- if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) + if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk || is_sched) return 0;
if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { @@ -643,7 +646,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "cgroup/", 7) == 0 || memcmp(shname, "sockops", 7) == 0 || memcmp(shname, "sk_skb", 6) == 0 || - memcmp(shname, "sk_msg", 6) == 0) { + memcmp(shname, "sk_msg", 6) == 0 || + memcmp(shname, "sched/", 6) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0)
From: Ren Zhijie renzhijie2@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
This sample chooses, via the arguments [-W|-T|-E], which bpf programs to attach to the BPF_SCHED hooks in the preemption path. The programs match tasks or task groups tagged TASK_TYPE_ONLINE or TASK_TYPE_OFFLINE.
Based on the tags, the programs return different values to the hooks to indicate whether the current sched entity should be preempted.
To run,
# sched_preempt [-W|-T|-E|-h]
# USAGE: sched_preempt [...]
#   -W    # Test sched preempt wakeup
#   -T    # Test sched preempt tick
#   -E    # Test wakeup preempt entity
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- samples/bpf/Makefile | 3 + samples/bpf/sched_preempt_kern.c | 147 +++++++++++++++++++++++++++++++ samples/bpf/sched_preempt_user.c | 140 +++++++++++++++++++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 samples/bpf/sched_preempt_kern.c create mode 100644 samples/bpf/sched_preempt_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index aeebf5d12f32..e473bad76549 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -54,6 +54,7 @@ tprogs-y += task_fd_query tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += sched_preempt
# Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -111,6 +112,7 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) +sched_preempt-objs := sched_preempt_user.o
# Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -172,6 +174,7 @@ always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o always-y += xdpsock_kern.o +always-y += sched_preempt_kern.o
ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/sched_preempt_kern.c b/samples/bpf/sched_preempt_kern.c new file mode 100644 index 000000000000..788883f72deb --- /dev/null +++ b/samples/bpf/sched_preempt_kern.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/version.h> +#include <linux/sched.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +unsigned long idea_runtime = 1000000UL; + +enum task_type { + TASK_TYPE_OFFLINE = -1, + TASK_TYPE_NORMAL, + TASK_TYPE_ONLINE, +}; + +#define getVal(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +#define bprintk(fmt, ...) \ + ({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ + }) + +SEC("sched/cfs_check_preempt_wakeup") +int BPF_PROG(sched_cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) +{ + long curr_type, p_type; + int ret = 0; + + curr_type = bpf_sched_task_tag_of(curr); + p_type = bpf_sched_task_tag_of(p); + + if (curr_type == TASK_TYPE_ONLINE && p_type == TASK_TYPE_OFFLINE) + ret = -1; + + if (curr_type == TASK_TYPE_OFFLINE && p_type == TASK_TYPE_ONLINE) + ret = 1; + + bprintk("check_preempt_wakeup: curr id = %d, p id = %d, preempt result is %d\n", + getVal(curr->pid), getVal(p->pid), ret); + + return ret; +} + +SEC("sched/cfs_check_preempt_tick") +int BPF_PROG(sched_cfs_check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec) +{ + long curr_type = TASK_TYPE_NORMAL; + int ret = 0, id = 0; + int entity_is_task = bpf_sched_entity_is_task(curr); + + if (entity_is_task) { + struct task_struct *tsk = bpf_sched_entity_to_task(curr); + + if (tsk) { + curr_type = bpf_sched_task_tag_of(tsk); + id = getVal(tsk->pid); + } + } else { + struct task_group *tg = bpf_sched_entity_to_tg(curr); + + if (tg) { + curr_type = bpf_sched_tg_tag_of(tg); + id = bpf_sched_entity_to_cgrpid(curr); + } + } + + if (curr_type == TASK_TYPE_ONLINE) + ret = delta_exec >= idea_runtime ? 
1 : -1; + + bprintk("check_preempt_tick: delta = %lu, entity id = %d, preempt result = %d\n", + delta_exec, id, ret); + return ret; +} + +SEC("sched/cfs_wakeup_preempt_entity") +int BPF_PROG(sched_cfs_wakeup_preempt_entity, struct sched_entity *curr, struct sched_entity *se) +{ + long curr_type = TASK_TYPE_NORMAL; + long p_type = TASK_TYPE_NORMAL; + int curr_id = 0, p_id = 0; + int curr_is_task = bpf_sched_entity_is_task(curr); + int p_is_task = bpf_sched_entity_is_task(se); + int ret = 0; + + if (curr_is_task) { + struct task_struct *tsk = bpf_sched_entity_to_task(curr); + + if (tsk) { + curr_type = bpf_sched_task_tag_of(tsk); + curr_id = getVal(tsk->pid); + } + } else { + struct task_group *tg = bpf_sched_entity_to_tg(curr); + + if (tg) { + curr_type = bpf_sched_tg_tag_of(tg); + curr_id = bpf_sched_entity_to_cgrpid(curr); + } + } + + if (p_is_task) { + struct task_struct *p = bpf_sched_entity_to_task(se); + + if (p) { + p_type = bpf_sched_task_tag_of(p); + p_id = getVal(p->pid); + } + } else { + struct task_group *tg1 = bpf_sched_entity_to_tg(se); + + if (tg1) { + p_type = bpf_sched_tg_tag_of(tg1); + p_id = bpf_sched_entity_to_cgrpid(se); + } + } + + if (curr_type == TASK_TYPE_ONLINE && p_type == TASK_TYPE_OFFLINE) + ret = -1; + + if (curr_type == TASK_TYPE_OFFLINE && p_type == TASK_TYPE_ONLINE) + ret = 1; + + bprintk("wakeup_preempt_entity: curr entity id = %d, se entity id = %d, result = %d\n", + curr_id, p_id, ret); + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sched_preempt_user.c b/samples/bpf/sched_preempt_user.c new file mode 100644 index 000000000000..92e64d04b687 --- /dev/null +++ b/samples/bpf/sched_preempt_user.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> + +#define MAX_PROGS (3) +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +int progindex[MAX_PROGS]; + +static void usage(void) +{ + printf("USAGE: sched_preempt [...]\n"); + printf(" -W # Test sched preempt wakeup\n"); + printf(" -T # Test sched preempt tick\n"); + printf(" -E # Test wakeup preempt entity\n"); + printf(" -h # Display this help\n"); +} + +/* read trace logs from debug fs */ +static void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +static inline bool check_attach_prog(int index) +{ + return progindex[index] ? 
true : false; +} + +int main(int argc, char **argv) +{ + int opt; + int index; + char filename[256]; + struct bpf_object *obj; + struct bpf_program *prog; + struct bpf_link *link[3] = {NULL}; + + char prognames[MAX_PROGS][256] = { + "sched_cfs_check_preempt_wakeup", + "sched_cfs_check_preempt_tick", + "sched_cfs_wakeup_preempt_entity", + }; + + while ((opt = getopt(argc, argv, "WTEh")) != -1) { + switch (opt) { + case 'W': + progindex[0] = 1; + break; + case 'T': + progindex[1] = 1; + break; + case 'E': + progindex[2] = 1; + break; + case 'h': + default: + usage(); + goto out; + } + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + goto out; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (index = 0; index < MAX_PROGS; ++index) { + if (check_attach_prog(index)) { + prog = bpf_object__find_program_by_name(obj, prognames[index]); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog:%s in obj file failed\n", + prognames[index]); + goto cleanup; + } + + link[index] = bpf_program__attach(prog); + if (libbpf_get_error(link[index])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link[index] = NULL; + goto cleanup; + } + } + } + + printf("preempt BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + for (index = MAX_PROGS - 1; index >= 0; index--) + bpf_link__destroy(link[index]); + bpf_object__close(obj); + +out: + return 0; +}
From: Ren Zhijie renzhijie2@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add helper functions bpf_sched_set_tg_tag() and bpf_sched_set_task_tag() to set the tag of a task group or a task.
They must not be called while rq->lock is held.
The use case is that other kernel subsystems, such as networking, can use them to mark key tasks.
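As an illustrative sketch (not part of this patch), a BTF-enabled tracepoint program could propagate a parent's tag to newly forked children, since the fork tracepoint runs without rq->lock held; the attach point, the helper availability outside BPF_PROG_TYPE_SCHED and the inheritance policy are assumptions:

#include <linux/version.h>
#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("tp_btf/sched_process_fork")
int BPF_PROG(inherit_tag, struct task_struct *parent, struct task_struct *child)
{
	long tag = bpf_sched_task_tag_of(parent);

	/* __sched_fork() clears the child's tag, so copy it back here;
	 * the child is not runnable yet and no rq->lock is held.
	 */
	if (tag > 0)
		bpf_sched_set_task_tag(child, tag);
	return 0;
}

char _license[] SEC("license") = "GPL";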
Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/uapi/linux/bpf.h | 14 ++++++++++++ kernel/bpf/helpers.c | 6 +++++ kernel/sched/bpf_sched.c | 41 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 14 ++++++++++++ 4 files changed, 75 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3b24912f2f7f..39e69583e8ff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3810,6 +3810,18 @@ union bpf_attr { * Return task group of *se* if se is a task group. * Return * Task struct if se is a task group, NULL otherwise. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3978,6 +3990,8 @@ union bpf_attr { FN(sched_entity_is_task), \ FN(sched_entity_to_task), \ FN(sched_entity_to_tg), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ef2d8cb87807..5fccf33196b5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -660,6 +660,8 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak; const struct bpf_func_proto bpf_sched_task_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_set_tg_tag_proto __weak; +const struct bpf_func_proto bpf_sched_set_task_tag_proto __weak;
const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -721,6 +723,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_sched_set_tg_tag: + return &bpf_sched_set_tg_tag_proto; + case BPF_FUNC_sched_set_task_tag: + return &bpf_sched_set_task_tag_proto; default: break; } diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 295f3ed02c27..4446bb702c30 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -207,6 +207,47 @@ static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = { .arg1_btf_id = &btf_sched_entity_ids[0], };
+BPF_CALL_2(bpf_sched_set_tg_tag, struct task_group *, tg, s64, tag) +{ +#if CONFIG_CGROUP_SCHED + if (tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +#endif + return -EPERM; +} + +const struct bpf_func_proto bpf_sched_set_tg_tag_proto = { + .func = bpf_sched_set_tg_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_sched_set_task_tag, struct task_struct *, tsk, s64, tag) +{ + sched_settag(tsk, tag); + return 0; +} + +const struct bpf_func_proto bpf_sched_set_task_tag_proto = { + .func = bpf_sched_set_task_tag, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], + .arg2_type = ARG_ANYTHING, +}; static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9bf63928487d..8844f900bf83 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3810,6 +3810,18 @@ union bpf_attr { * Return task group of *se* if se is a task group. * Return * Task struct if se is a task group, NULL otherwise. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3978,6 +3990,8 @@ union bpf_attr { FN(sched_entity_is_task), \ FN(sched_entity_to_task), \ FN(sched_entity_to_tg), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add four helper functions to get cpu statistics, as follows:
1. acquire cfs/rt/irq cpu load statistics.
2. acquire multiple types of nr_running statistics.
3. acquire cpu idle statistics.
4. acquire cpu capacity.
Based on CPU statistics in different dimensions, specific scheduling policies can be implemented in a bpf program.
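As an illustrative sketch (not part of this patch), a sched bpf prog could consult these statistics before allowing a preemption; the hook choice and the thresholds below are assumptions:

#include <linux/version.h>
#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("sched/cfs_check_preempt_wakeup")
int BPF_PROG(throttle_preempt, struct task_struct *curr, struct task_struct *p)
{
	struct bpf_sched_cpu_nr_running nr;
	struct bpf_sched_cpu_load load;
	int cpu = bpf_get_smp_processor_id();

	if (bpf_sched_cpu_nr_running_of(cpu, &nr, sizeof(nr)) ||
	    bpf_sched_cpu_load_of(cpu, &load, sizeof(load)))
		return 0;

	/* Made-up policy: suppress wakeup preemption on an oversubscribed,
	 * highly utilized CPU.
	 */
	if (nr.cfs_h_nr_running > 8 && load.cfs_util_avg > 512)
		return -1;

	return 0;
}

char _license[] SEC("license") = "GPL";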
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/linux/sched.h | 33 +++++++ include/uapi/linux/bpf.h | 28 ++++++ kernel/sched/bpf_sched.c | 155 +++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 8 ++ tools/include/uapi/linux/bpf.h | 28 ++++++ 5 files changed, 252 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index d6db602da068..c969af754263 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2185,5 +2185,38 @@ static inline int sched_qos_cpu_overload(void)
#ifdef CONFIG_BPF_SCHED extern void sched_settag(struct task_struct *tsk, s64 tag); + +struct bpf_sched_cpu_load { + unsigned long cfs_load_avg; + unsigned long cfs_runnable_avg; + unsigned long cfs_util_avg; + unsigned long rt_load_avg; + unsigned long rt_runnable_avg; + unsigned long rt_util_avg; + unsigned long irq_load_avg; + unsigned long irq_runnable_avg; + unsigned long irq_util_avg; +}; + +struct bpf_sched_cpu_nr_running { + unsigned int nr_running; + unsigned int cfs_nr_running; + unsigned int cfs_h_nr_running; + unsigned int cfs_idle_h_nr_running; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +}; + +struct bpf_sched_cpu_idle_stat { + int available_idle; + unsigned int exit_latency; + unsigned long idle_stamp; + unsigned long avg_idle; +}; + +struct bpf_sched_cpu_capacity { + unsigned long capacity; + unsigned long capacity_orig; +}; #endif #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 39e69583e8ff..b898cae70b0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3822,6 +3822,30 @@ union bpf_attr { * Set tag to *tsk*. * Return * Nothing. Always succeeds. + * + * int bpf_sched_cpu_load_of(int cpu, struct bpf_sched_cpu_load *ctx, int len) + * Description + * Get multiple types of *cpu* load and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_nr_running_of(int cpu, struct bpf_sched_cpu_nr_running *ctx, int len) + * Description + * Get multiple types of *cpu* nr running and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_idle_stat_of(int cpu, struct bpf_sched_cpu_idle_stat *ctx, int len) + * Description + * Get *cpu* idle state and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_capacity_of(int cpu, struct bpf_sched_cpu_capacity *ctx, int len) + * Description + * Get *cpu* capacity and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3992,6 +4016,10 @@ union bpf_attr { FN(sched_entity_to_tg), \ FN(sched_set_tg_tag), \ FN(sched_set_task_tag), \ + FN(sched_cpu_load_of), \ + FN(sched_cpu_nr_running_of), \ + FN(sched_cpu_idle_stat_of), \ + FN(sched_cpu_capacity_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 4446bb702c30..db2ca47f2937 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -248,6 +248,153 @@ const struct bpf_func_proto bpf_sched_set_task_tag_proto = { .arg1_btf_id = &btf_sched_task_ids[0], .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_3(bpf_sched_cpu_load_of, int, cpu, + struct bpf_sched_cpu_load *, ctx, + int, len) +{ + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_load)); +#ifdef CONFIG_SMP + rq = cpu_rq(cpu); + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->cfs_load_avg = rq->cfs.avg.load_avg; + ctx->cfs_runnable_avg = rq->cfs.avg.runnable_avg; + ctx->cfs_util_avg = rq->cfs.avg.util_avg; + ctx->rt_load_avg = rq->avg_rt.load_avg; + ctx->rt_runnable_avg = rq->avg_rt.runnable_avg; + ctx->rt_util_avg = rq->avg_rt.util_avg; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + ctx->irq_load_avg = rq->avg_irq.load_avg; + ctx->irq_runnable_avg = rq->avg_irq.runnable_avg; + ctx->irq_util_avg = rq->avg_irq.util_avg; +#endif +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_load_of_proto = { + .func = bpf_sched_cpu_load_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_nr_running_of, int, cpu, + struct bpf_sched_cpu_nr_running *, ctx, + int, len) +{ + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + SCHED_WARN_ON(!rcu_read_lock_held()); + + rq = cpu_rq(cpu); + ctx->nr_running = rq->nr_running; + ctx->cfs_nr_running = rq->cfs.nr_running; + ctx->cfs_h_nr_running = rq->cfs.h_nr_running; + ctx->cfs_idle_h_nr_running = rq->cfs.idle_h_nr_running; + ctx->rt_nr_running = rq->rt.rt_nr_running; + ctx->rr_nr_running = rq->rt.rr_nr_running; + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_nr_running_of_proto = { + .func = bpf_sched_cpu_nr_running_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_idle_stat_of, int, cpu, + struct bpf_sched_cpu_idle_stat *, ctx, + int, len) +{ + struct cpuidle_state *idle; + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_idle_stat)); + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->available_idle = available_idle_cpu(cpu); + rq = cpu_rq(cpu); + idle = idle_get_state(rq); + if (idle) + ctx->exit_latency = idle->exit_latency; + +#ifdef CONFIG_SMP + ctx->idle_stamp = rq->idle_stamp; + ctx->avg_idle = rq->avg_idle; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_idle_stat_of_proto = { + .func = bpf_sched_cpu_idle_stat_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_capacity_of, int, cpu, + struct bpf_sched_cpu_capacity *, ctx, + int, len) +{ + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_capacity)); +#ifdef CONFIG_SMP + 
SCHED_WARN_ON(!rcu_read_lock_held()); + rq = cpu_rq(cpu); + ctx->capacity = rq->cpu_capacity; + ctx->capacity_orig = rq->cpu_capacity_orig; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_capacity_of_proto = { + .func = bpf_sched_cpu_capacity_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -266,6 +413,14 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_entity_to_task_proto; case BPF_FUNC_sched_entity_to_tg: return &bpf_sched_entity_to_tg_proto; + case BPF_FUNC_sched_cpu_load_of: + return &bpf_sched_cpu_load_of_proto; + case BPF_FUNC_sched_cpu_nr_running_of: + return &bpf_sched_cpu_nr_running_of_proto; + case BPF_FUNC_sched_cpu_idle_stat_of: + return &bpf_sched_cpu_idle_stat_of_proto; + case BPF_FUNC_sched_cpu_capacity_of: + return &bpf_sched_cpu_capacity_of_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index f8a778ac9ce1..f2b5e63801ca 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -437,6 +437,10 @@ class PrinterHelpers(Printer): 'struct btf_ptr', 'struct sched_entity', 'struct task_group', + 'struct bpf_sched_cpu_load', + 'struct bpf_sched_cpu_nr_running', + 'struct bpf_sched_cpu_idle_stat', + 'struct bpf_sched_cpu_capacity', ] known_types = { '...', @@ -482,6 +486,10 @@ class PrinterHelpers(Printer): 'struct btf_ptr', 'struct sched_entity', 'struct task_group', + 'struct bpf_sched_cpu_load', + 'struct bpf_sched_cpu_nr_running', + 'struct bpf_sched_cpu_idle_stat', + 'struct bpf_sched_cpu_capacity', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8844f900bf83..97295cd863c4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3822,6 +3822,30 @@ union bpf_attr { * Set tag to *tsk*. * Return * Nothing. Always succeeds. + * + * int bpf_sched_cpu_load_of(int cpu, struct bpf_sched_cpu_load *ctx, int len) + * Description + * Get multiple types of *cpu* load and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_nr_running_of(int cpu, struct bpf_sched_cpu_nr_running *ctx, int len) + * Description + * Get multiple types of *cpu* nr running and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_idle_stat_of(int cpu, struct bpf_sched_cpu_idle_stat *ctx, int len) + * Description + * Get *cpu* idle state and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_capacity_of(int cpu, struct bpf_sched_cpu_capacity *ctx, int len) + * Description + * Get *cpu* capacity and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3992,6 +4016,10 @@ union bpf_attr { FN(sched_entity_to_tg), \ FN(sched_set_tg_tag), \ FN(sched_set_task_tag), \ + FN(sched_cpu_load_of), \ + FN(sched_cpu_nr_running_of), \ + FN(sched_cpu_idle_stat_of), \ + FN(sched_cpu_capacity_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Ren Zhijie renzhijie2@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add bpf helper function bpf_init_cpu_topology(), which obtains cpu topology info through the topology_* macros defined in include/linux/topology.h and saves it in a BPF map.
The cpu topology info is useful for selecting a core in userspace.
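As an illustrative sketch (not part of this patch), a sched bpf prog could fill an array map with the per-cpu topology and query the cpumask info; the map size, the hook used to trigger the fill and the generated helper declarations are assumptions:

#include <linux/version.h>
#include <linux/sched.h>
#include <linux/bpf_topology.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 256);	/* assumed upper bound on cpu ids */
	__type(key, int);
	__type(value, struct bpf_cpu_topology);
} cpu_topo SEC(".maps");

SEC("sched/cfs_check_preempt_wakeup")
int BPF_PROG(fill_topology, struct task_struct *curr, struct task_struct *p)
{
	struct bpf_cpumask_info cpus;

	/* Fill the map with one bpf_cpu_topology entry per active cpu;
	 * a real program would do this only once.
	 */
	bpf_init_cpu_topology(&cpu_topo, BPF_ANY);

	if (!bpf_get_cpumask_info(&cpus, sizeof(cpus)))
		bpf_printk("possible cpus: %u\n", cpus.nums_possible_cpus);

	return 0;
}

char _license[] SEC("license") = "GPL";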
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/linux/bpf_topology.h | 46 ++++++++++++++++ include/uapi/linux/bpf.h | 14 +++++ kernel/sched/Makefile | 3 +- kernel/sched/bpf_sched.c | 8 +++ kernel/sched/bpf_topology.c | 97 ++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 14 +++++ 7 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf_topology.h create mode 100644 kernel/sched/bpf_topology.c
diff --git a/include/linux/bpf_topology.h b/include/linux/bpf_topology.h new file mode 100644 index 000000000000..0c7ee492edde --- /dev/null +++ b/include/linux/bpf_topology.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_BPF_TOPOLOGY_H +#define _LINUX_BPF_TOPOLOGY_H + +#include <linux/cpumask.h> + +struct bpf_cpu_topology { + int cpu; + int core_id; + int cluster_id; + int die_id; + int physical_package_id; + int numa_node; + struct cpumask thread_siblings; + struct cpumask core_siblings; + struct cpumask cluster_cpus; + struct cpumask die_cpus; + struct cpumask package_cpus; + struct cpumask node_cpu_lists; +}; + +struct bpf_cpumask_info { + unsigned int nums_possible_cpus; + unsigned int nums_active_cpus; + unsigned int nums_isolate_cpus; + unsigned int nr_cpu_ids; + unsigned int bpf_nr_cpumask_bits; + struct cpumask cpu_possible_cpumask; + struct cpumask cpu_active_cpumask; + struct cpumask cpu_isolate_cpumask; +}; + +#endif /* _LINUX_BPF_TOPOLOGY_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b898cae70b0a..06ae8a7f9ef3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3846,6 +3846,18 @@ union bpf_attr { * Get *cpu* capacity and store in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map, u64 flags) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_cpumask_info(struct bpf_cpumask_info *cpus, int len) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4020,6 +4032,8 @@ union bpf_attr { FN(sched_cpu_nr_running_of), \ FN(sched_cpu_idle_stat_of), \ FN(sched_cpu_capacity_of), \ + FN(init_cpu_topology), \ + FN(get_cpumask_info), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae9e39eb83a..c809d5c28424 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -36,4 +36,5 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o -obj-$(CONFIG_BPF_SCHED) += bpf_sched.o \ No newline at end of file +obj-$(CONFIG_BPF_SCHED) += bpf_sched.o +obj-$(CONFIG_BPF_SCHED) += bpf_topology.o \ No newline at end of file diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index db2ca47f2937..6f2200170093 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -4,6 +4,7 @@ #include <linux/bpf_verifier.h> #include <linux/bpf_sched.h> #include <linux/btf_ids.h> +#include <linux/bpf_topology.h> #include "sched.h"
DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); @@ -26,6 +27,9 @@ BTF_SET_START(bpf_sched_hooks) #undef BPF_SCHED_HOOK BTF_SET_END(bpf_sched_hooks)
+const struct bpf_func_proto bpf_init_cpu_topology_proto __weak; +const struct bpf_func_proto bpf_get_cpumask_info_proto __weak; + int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -421,6 +425,10 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_cpu_idle_stat_of_proto; case BPF_FUNC_sched_cpu_capacity_of: return &bpf_sched_cpu_capacity_of_proto; + case BPF_FUNC_init_cpu_topology: + return &bpf_init_cpu_topology_proto; + case BPF_FUNC_get_cpumask_info: + return &bpf_get_cpumask_info_proto; default: return bpf_base_func_proto(func_id); } diff --git a/kernel/sched/bpf_topology.c b/kernel/sched/bpf_topology.c new file mode 100644 index 000000000000..9c2eda139e2a --- /dev/null +++ b/kernel/sched/bpf_topology.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/bpf_verifier.h> +#include <linux/topology.h> +#include <linux/cpumask.h> +#include <linux/bpf_topology.h> +#include <linux/sched/isolation.h> + +static void bpf_update_cpu_topology(struct bpf_cpu_topology *cpu_topology, int cpu) +{ + cpu_topology->cpu = cpu; + cpu_topology->core_id = topology_core_id(cpu); + cpu_topology->cluster_id = topology_cluster_id(cpu); + cpu_topology->die_id = topology_die_id(cpu); + cpu_topology->physical_package_id = topology_physical_package_id(cpu); + cpu_topology->numa_node = cpu_to_node(cpu); + cpumask_copy(&cpu_topology->thread_siblings, topology_sibling_cpumask(cpu)); + cpumask_copy(&cpu_topology->core_siblings, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->cluster_cpus, topology_cluster_cpumask(cpu)); + cpumask_copy(&cpu_topology->die_cpus, topology_die_cpumask(cpu)); + cpumask_copy(&cpu_topology->package_cpus, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->node_cpu_lists, cpumask_of_node(cpu_to_node(cpu))); +} + +BPF_CALL_2(bpf_init_cpu_topology, struct bpf_map *, map, u64, flags) +{ + const struct cpumask *cpu_map = cpu_active_mask; + int ret = 0; + int i = -1; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + for_each_cpu(i, cpu_map) { + struct bpf_cpu_topology topo; + + bpf_update_cpu_topology(&topo, i); + ret = map->ops->map_update_elem(map, &i, &topo, flags); + if (ret) { + int idx = i; + + for (; idx >= 0; idx--) + map->ops->map_delete_elem(map, &idx); + break; + } + } + + return ret; +} + +BTF_ID_LIST_SINGLE(bpf_cpu_topology_ids, struct, bpf_cpu_topology) + +const struct bpf_func_proto bpf_init_cpu_topology_proto = { + .func = bpf_init_cpu_topology, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_get_cpumask_info, struct bpf_cpumask_info *, cpus, + int, len) +{ + if (len != sizeof(*cpus)) + return -EINVAL; + + cpumask_copy(&cpus->cpu_possible_cpumask, cpu_possible_mask); + cpumask_copy(&cpus->cpu_active_cpumask, cpu_active_mask); + cpumask_copy(&cpus->cpu_isolate_cpumask, 
housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpus->nums_possible_cpus = num_possible_cpus(); + cpus->nums_active_cpus = num_active_cpus(); + cpus->nums_isolate_cpus = cpumask_weight(&cpus->cpu_isolate_cpumask); + cpus->nr_cpu_ids = nr_cpu_ids; + cpus->bpf_nr_cpumask_bits = nr_cpumask_bits; + + return 0; +} + +const struct bpf_func_proto bpf_get_cpumask_info_proto = { + .func = bpf_get_cpumask_info, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index f2b5e63801ca..b99981bf62f2 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -441,6 +441,8 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_nr_running', 'struct bpf_sched_cpu_idle_stat', 'struct bpf_sched_cpu_capacity', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', ] known_types = { '...', @@ -490,6 +492,8 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_nr_running', 'struct bpf_sched_cpu_idle_stat', 'struct bpf_sched_cpu_capacity', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 97295cd863c4..b3be7de118d5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3846,6 +3846,18 @@ union bpf_attr { * Get *cpu* capacity and store in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map, u64 flags) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_cpumask_info(struct bpf_cpumask_info *cpus, int len) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4020,6 +4032,8 @@ union bpf_attr { FN(sched_cpu_nr_running_of), \ FN(sched_cpu_idle_stat_of), \ FN(sched_cpu_capacity_of), \ + FN(init_cpu_topology), \ + FN(get_cpumask_info), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a collection of cpumask ops, such as cpumask_empty, cpumask_and, cpumask_andnot, cpumask_subset, cpumask_equal and cpumask_copy.
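For example, a minimal sketch (not part of this patch) of driving the helper directly through struct cpumask_op_args; the libbpf_sched.h wrappers added later in this series wrap exactly this pattern:

/* Hypothetical sketch: AND two cpumasks with bpf_cpumask_op(). All pointers
 * must reference memory the BPF program can see (stack, map values or
 * context fields).
 */
static __always_inline int sketch_cpumask_and(struct cpumask *dst,
					      struct cpumask *src1,
					      struct cpumask *src2)
{
	struct cpumask_op_args op = {
		.op_type = CPUMASK_AND,
		.arg1 = dst,		/* result mask */
		.arg2 = src1,
		.arg3 = src2,
		.arg4 = NULL,		/* unused by CPUMASK_AND */
	};

	/* returns 1 if the result is non-empty, 0 if empty, negative on error */
	return bpf_cpumask_op(&op, sizeof(op));
}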
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched.h | 23 +++++++++ include/uapi/linux/bpf.h | 43 +++++++++++++++++ kernel/sched/bpf_sched.c | 86 ++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++ 5 files changed, 199 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index c969af754263..1b0cc2bfbd75 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2218,5 +2218,28 @@ struct bpf_sched_cpu_capacity { unsigned long capacity; unsigned long capacity_orig; }; + +struct cpumask_op_args { + unsigned int op_type; + void *arg1; + void *arg2; + void *arg3; + void *arg4; +}; + +enum cpumask_op_type { + CPUMASK_EMPTY, + CPUMASK_AND, + CPUMASK_ANDNOT, + CPUMASK_SUBSET, + CPUMASK_EQUAL, + CPUMASK_TEST_CPU, + CPUMASK_COPY, + CPUMASK_WEIGHT, + CPUMASK_NEXT, + CPUMASK_NEXT_WRAP, + CPUMASK_NEXT_AND, + CPUMASK_CPULIST_PARSE +}; #endif #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 06ae8a7f9ef3..b48890aef086 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3858,6 +3858,48 @@ union bpf_attr { * Get system cpus returned in *cpus*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * A series of cpumask-related operations. Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. *op*->type is one of them + * + * **CPUMASK_EMPTY** + * *(op->arg1) == 0 returned. + * **CPUMASK_AND** + * *(op->arg1) = *(op->arg2) & *(op->arg3) + * **CPUMASK_ANDNOT** + * *(op->arg1) = *(op->arg2) & ~*(op->arg3) + * **CPUMASK_SUBSET** + * *(op->arg1) & ~*(op->arg2) == 0 returned + * **CPUMASK_EQUAL** + * *(op->arg1) == *(op->arg2) returned + * **CPUMASK_TEST_CPU** + * test for a cpu *(int)(op->arg1) in *(op->arg2) + * returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0 + * **CPUMASK_COPY** + * *(op->arg1) = *(op->arg2), return 0 always + * **CPUMASK_WEIGHT** + * count of bits in *(op->arg1) + * **CPUMASK_NEXT** + * get the next cpu in *(struct cpumask *)(op->arg2) + * *(int *)(op->arg1): the cpu prior to the place to search + * **CPUMASK_NEXT_WRAP** + * helper to implement for_each_cpu_wrap + * @op->arg1: the cpu prior to the place to search + * @op->arg2: the cpumask pointer + * @op->arg3: the start point of the iteration + * @op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration + * returns >= nr_cpu_ids on completion + * **CPUMASK_NEXT_AND** + * get the next cpu in *(op->arg1) & *(op->arg2) + * **CPUMASK_CPULIST_PARSE** + * extract a cpumask from a user string of ranges. + * (char *)op->arg1 -> (struct cpumask *)(op->arg2) + * 0 on success, or a negative error in case of failure. + * Return + * View above. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4034,6 +4076,7 @@ union bpf_attr { FN(sched_cpu_capacity_of), \ FN(init_cpu_topology), \ FN(get_cpumask_info), \ + FN(cpumask_op), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 6f2200170093..73834d29b614 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -398,6 +398,90 @@ static const struct bpf_func_proto bpf_sched_cpu_capacity_of_proto = { .arg3_type = ARG_CONST_SIZE, };
+BPF_CALL_2(bpf_cpumask_op, struct cpumask_op_args *, op, int, len) +{ + int ret; + + if (len != sizeof(*op) || !op->arg1) + return -EINVAL; + + switch (op->op_type) { + case CPUMASK_EMPTY: + return cpumask_empty((const struct cpumask *)op->arg1); + case CPUMASK_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_and((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_ANDNOT: + if (!op->arg2 || !op->arg3) + return -EINVAL; + cpumask_andnot((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + break; + case CPUMASK_SUBSET: + if (!op->arg2) + return -EINVAL; + return cpumask_subset((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_EQUAL: + if (!op->arg2) + return -EINVAL; + return cpumask_equal((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_TEST_CPU: + if (!op->arg2) + return -EINVAL; + return cpumask_test_cpu(*(int *)op->arg1, op->arg2); + case CPUMASK_COPY: + if (!op->arg2) + return -EINVAL; + cpumask_copy((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + break; + case CPUMASK_WEIGHT: + return cpumask_weight((const struct cpumask *)op->arg1); + case CPUMASK_NEXT: + if (!op->arg2) + return -EINVAL; + return cpumask_next(*(int *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_NEXT_WRAP: + if (!op->arg2 || !op->arg3 || !op->arg4) + return -EINVAL; + return cpumask_next_wrap(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + *(int *)op->arg3, *(int *)op->arg4); + case CPUMASK_NEXT_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_next_and(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_CPULIST_PARSE: + if (!op->arg2) + return -EINVAL; + + op->arg1 = (void *)strstrip((void *)op->arg1); + ret = cpulist_parse((void *)op->arg1, + (struct cpumask *)op->arg2); + return ret; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_cpumask_op_proto = { + .func = bpf_cpumask_op, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, +};
static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -429,6 +513,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_init_cpu_topology_proto; case BPF_FUNC_get_cpumask_info: return &bpf_get_cpumask_info_proto; + case BPF_FUNC_cpumask_op: + return &bpf_cpumask_op_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index b99981bf62f2..2f9d2160b5fb 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -443,6 +443,8 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_capacity', 'struct bpf_cpu_topology', 'struct bpf_cpumask_info', + 'struct cpumask', + 'struct cpumask_op_args', ] known_types = { '...', @@ -494,6 +496,8 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_capacity', 'struct bpf_cpu_topology', 'struct bpf_cpumask_info', + 'struct cpumask', + 'struct cpumask_op_args', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b3be7de118d5..09b9dc45c49d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3858,6 +3858,48 @@ union bpf_attr { * Get system cpus returned in *cpus*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * A series of cpumask-related operations. Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. *op*->type is one of them + * + * **CPUMASK_EMPTY** + * *(op->arg1) == 0 returned. + * **CPUMASK_AND** + * *(op->arg1) = *(op->arg2) & *(op->arg3) + * **CPUMASK_ANDNOT** + * *(op->arg1) = *(op->arg2) & ~*(op->arg3) + * **CPUMASK_SUBSET** + * *(op->arg1) & ~*(op->arg2) == 0 returned + * **CPUMASK_EQUAL** + * *(op->arg1) == *(op->arg2) returned + * **CPUMASK_TEST_CPU** + * test for a cpu *(int)(op->arg1) in *(op->arg2) + * returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0 + * **CPUMASK_COPY** + * *(op->arg1) = *(op->arg2), return 0 always + * **CPUMASK_WEIGHT** + * count of bits in *(op->arg1) + * **CPUMASK_NEXT** + * get the next cpu in *(struct cpumask *)(op->arg2) + * *(int *)(op->arg1): the cpu prior to the place to search + * **CPUMASK_NEXT_WRAP** + * helper to implement for_each_cpu_wrap + * @op->arg1: the cpu prior to the place to search + * @op->arg2: the cpumask pointer + * @op->arg3: the start point of the iteration + * @op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration + * returns >= nr_cpu_ids on completion + * **CPUMASK_NEXT_AND** + * get the next cpu in *(op->arg1) & *(op->arg2) + * **CPUMASK_CPULIST_PARSE** + * extract a cpumask from a user string of ranges. + * (char *)op->arg1 -> (struct cpumask *)(op->arg2) + * 0 on success, or a negative error in case of failure. + * Return + * View above. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4034,6 +4076,7 @@ union bpf_attr { FN(sched_cpu_capacity_of), \ FN(init_cpu_topology), \ FN(get_cpumask_info), \ + FN(cpumask_op), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Hui Tang tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a helper function to check whether two cpus share the same LLC.
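For example, a sketch of a wake-affine style program built on this helper. The 'cfs_wake_affine' hook and struct sched_affine_ctx come from a later patch in this series, and direct context field access is assumed here (the in-tree samples read fields via bpf_probe_read_kernel() instead):

/* Hypothetical sketch: prefer the waking CPU only when it shares an LLC
 * with the task's previous CPU, using bpf_cpus_share_cache().
 */
SEC("sched/cfs_wake_affine")
int BPF_PROG(wake_affine_same_llc, struct sched_affine_ctx *h_ctx)
{
	int prev_cpu = h_ctx->prev_cpu;
	int curr_cpu = h_ctx->curr_cpu;

	if (bpf_cpus_share_cache(curr_cpu, prev_cpu))
		return curr_cpu;	/* same LLC: the waking CPU is fine */

	return prev_cpu;		/* different LLC: keep cache locality */
}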
Signed-off-by: Hui Tang tanghui20@huawei.com --- include/uapi/linux/bpf.h | 7 +++++++ kernel/sched/bpf_sched.c | 19 +++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 3 files changed, 33 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b48890aef086..3bbce04f4c33 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3900,6 +3900,12 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. * Return * View above. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * yes 1, no 0. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4077,6 +4083,7 @@ union bpf_attr { FN(init_cpu_topology), \ FN(get_cpumask_info), \ FN(cpumask_op), \ + FN(cpus_share_cache), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 73834d29b614..1584964c28d1 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -483,6 +483,23 @@ static const struct bpf_func_proto bpf_cpumask_op_proto = { .arg2_type = ARG_CONST_SIZE, };
+BPF_CALL_2(bpf_cpus_share_cache, int, src_cpu, int, dst_cpu) +{ + if ((unsigned int)src_cpu >= nr_cpu_ids || + (unsigned int)dst_cpu >= nr_cpu_ids) + return 0; + + return cpus_share_cache(src_cpu, dst_cpu); +} + +static const struct bpf_func_proto bpf_cpus_share_cache_proto = { + .func = bpf_cpus_share_cache, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -515,6 +532,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cpumask_info_proto; case BPF_FUNC_cpumask_op: return &bpf_cpumask_op_proto; + case BPF_FUNC_cpus_share_cache: + return &bpf_cpus_share_cache_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 09b9dc45c49d..a035635f938d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3900,6 +3900,12 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. * Return * View above. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * true yes, false no. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4077,6 +4083,7 @@ union bpf_attr { FN(init_cpu_topology), \ FN(get_cpumask_info), \ FN(cpumask_op), \ + FN(cpus_share_cache), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a library for programmable sched; these functions make it easier to write user programs.
The main functions are as follows: 1. Wrap the helper functions to make them easier to use. 2. Implement some generic methods and policies for the scheduler.
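A minimal usage sketch, assuming the wrappers below; the function name is hypothetical:

/* Hypothetical sketch: pick the first idle CPU in the task's allowed set
 * using the libbpf_sched.h wrappers.
 */
#include <bpf/libbpf_sched.h>

static __always_inline int first_idle_allowed_cpu(struct task_struct *p)
{
	int cpu;

	libbpf_for_each_cpu(cpu, (void *)getVal(p->cpus_ptr)) {
		if (libbpf_available_idle_cpu(cpu))
			return cpu;
	}

	return -1;	/* no idle CPU among the allowed CPUs */
}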
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com --- tools/lib/bpf/libbpf_sched.h | 466 +++++++++++++++++++++++++++++++++++ 1 file changed, 466 insertions(+) create mode 100644 tools/lib/bpf/libbpf_sched.h
diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h new file mode 100644 index 000000000000..9a0e6b0653df --- /dev/null +++ b/tools/lib/bpf/libbpf_sched.h @@ -0,0 +1,466 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef __LIBBPF_LIBSCHED_H +#define __LIBBPF_LIBSCHED_H + +#include <linux/bpf_topology.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define INVALID_PTR ((void *)(0UL)) +#define getVal(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask); +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap); +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2); +static __always_inline int libbpf_nr_cpus_ids(void); +static __always_inline int libbpf_nr_cpumask_bits(void); + +#if NR_CPUS == 1 + +#define libbpf_for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) + +#else + +#define libbpf_for_each_cpu(cpu, mask) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next((cpu), (mask)), \ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for (int __i = 0, (cpu) = libbpf_cpumask_next_wrap((start) - 1,\ + (mask), (start), false); \ + (cpu) < libbpf_nr_cpumask_bits() && __i < NR_CPUS; \ + (cpu) = libbpf_cpumask_next_wrap((cpu), (mask), (start),\ + true), __i++) + +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next_and((cpu), (mask1), (mask2)),\ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#endif + +static __always_inline long libbpf_cpumask_copy(struct cpumask *dst, + struct cpumask *src) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_COPY; + op.arg1 = dst; + op.arg2 = src; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_empty(struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EMPTY; + op.arg1 = mask; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_and(struct cpumask *dst, + struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_AND; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_andnot(struct cpumask *dst, + 
struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_ANDNOT; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_subset(struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_SUBSET; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_equal(struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EQUAL; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_weight(struct cpumask *src1) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_WEIGHT; + op.arg1 = src1; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_test_cpu(int cpu, + struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_TEST_CPU; + op.arg1 = &cpu; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_WRAP; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = &start; + op.arg4 = &wrap; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_AND; + op.arg1 = &n; + op.arg2 = mask1; + op.arg3 = mask2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_cpulist_parse(char *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_CPULIST_PARSE; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline int libbpf_num_active_cpus(void) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + return getVal(cpus.nums_active_cpus); +} + +static __always_inline int libbpf_num_possible_cpus(void) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + return getVal(cpus.nums_possible_cpus); +} + +static __always_inline void libbpf_possible_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, &cpus.cpu_possible_cpumask); +} + +static __always_inline void libbpf_active_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, &cpus.cpu_active_cpumask); +} + +static __always_inline void libbpf_isolate_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, 
&cpus.cpu_isolate_cpumask); +} + +static __always_inline int libbpf_nr_cpus_ids(void) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + return getVal(cpus.nr_cpu_ids); +} + +static __always_inline int libbpf_nr_cpumask_bits(void) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + return getVal(cpus.bpf_nr_cpumask_bits); +} + +static __always_inline unsigned long libbpf_cfs_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_load_avg); +} + +static __always_inline unsigned long libbpf_cfs_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_runnable_avg); +} + +static __always_inline unsigned long libbpf_cfs_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_util_avg); +} + +static __always_inline unsigned long libbpf_rt_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_load_avg; +} + +static __always_inline unsigned long libbpf_rt_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_runnable_avg; +} + +static __always_inline unsigned long libbpf_rt_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_util_avg; +} + +static __always_inline unsigned long libbpf_irq_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.irq_load_avg; +} + +static __always_inline unsigned long libbpf_irq_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.irq_util_avg; +} + +static __always_inline unsigned int libbpf_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.nr_running); +} + +static __always_inline unsigned int libbpf_cfs_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_h_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_idle_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return running.cfs_idle_h_nr_running; +} + +static __always_inline unsigned int libbpf_rt_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.rt_nr_running); +} + +static __always_inline unsigned int libbpf_rr_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return running.rr_nr_running; +} + +static __always_inline unsigned int libbpf_exit_latency_of(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.exit_latency; +} + +static 
__always_inline unsigned long libbpf_idle_stamp_of(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.idle_stamp; +} + +static __always_inline unsigned long libbpf_avg_idle_of(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.avg_idle; +} + +static __always_inline unsigned long libbpf_available_idle_cpu(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return getVal(stat.available_idle); +} + +static __always_inline unsigned long libbpf_capacity_of(int cpu) +{ + struct bpf_sched_cpu_capacity cap; + + bpf_sched_cpu_capacity_of(cpu, &cap, sizeof(cap)); + return getVal(cap.capacity); +} + +static __always_inline unsigned long libbpf_capacity_orig_of(int cpu) +{ + struct bpf_sched_cpu_capacity cap; + + bpf_sched_cpu_capacity_of(cpu, &cap, sizeof(cap)); + return cap.capacity_orig; +} + +static __always_inline int libbpf_cpus_share_cache(int src_cpu, int dst_cpu) +{ + return bpf_cpus_share_cache(src_cpu, dst_cpu); +} + +static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se) +{ + int se_tag = 0; + + if (bpf_sched_entity_is_task(se)) { + struct task_struct *task = bpf_sched_entity_to_task(se); + + se_tag = bpf_sched_task_tag_of(task); + } else { + struct task_group *tg = bpf_sched_entity_to_tg(se); + + se_tag = bpf_sched_tg_tag_of(tg); + } + + return se_tag; +} +#endif
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add three sched-type hooks in select_task_rq_fair(), as follows: 'cfs_select_rq' Replaces the original core selection policy or implements dynamic CPU affinity.
'cfs_select_rq_exit' Restores the CPU affinity of the task before 'select_task_rq_fair' exits.
Used together with the 'cfs_select_rq' hook to implement dynamic CPU affinity.
'cfs_wake_affine' Determines on which CPU the task can run soonest, allowing users to implement different policies.
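A minimal sketch of the return-value contract: returning a CPU number (>= 0) from 'cfs_select_rq' short-circuits the default selection, while a negative value falls back to it. bpf_sched_task_tag_of() comes from an earlier patch in this series; the program name and direct context access are assumptions:

/* Hypothetical sketch: send tagged tasks back to their previous CPU and
 * defer to the default path for everything else.
 */
SEC("sched/cfs_select_rq")
int BPF_PROG(select_prev_for_tagged, struct sched_migrate_ctx *h_ctx)
{
	if (bpf_sched_task_tag_of(h_ctx->task) > 0)
		return h_ctx->prev_cpu;	/* >= 0: use this CPU directly */

	return -1;			/* < 0: fall back to default selection */
}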
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched.h | 20 +++++++++++++ include/linux/sched_hook_defs.h | 3 ++ kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ 4 files changed, 75 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 1b0cc2bfbd75..626cc23b11ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2241,5 +2241,25 @@ enum cpumask_op_type { CPUMASK_NEXT_AND, CPUMASK_CPULIST_PARSE }; + +struct sched_migrate_ctx { + struct task_struct *task; + struct cpumask *cpus_allowed; + struct cpumask *select_idle_mask; + int prev_cpu; + int curr_cpu; + int is_sync; + int want_affine; + int wake_flags; + int sd_flag; + int new_cpu; +}; + +struct sched_affine_ctx { + struct task_struct *task; + int prev_cpu; + int curr_cpu; + int is_sync; +}; #endif #endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index e2f65e4b8895..4e359649db4b 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -3,3 +3,6 @@ BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsign BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, struct sched_entity *se) +BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79e36870b206..23206e6320f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6023,6 +6023,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, { int target = nr_cpumask_bits;
+#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + struct sched_affine_ctx ctx; + int ret; + + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = this_cpu; + ctx.is_sync = sync; + + ret = bpf_sched_cfs_wake_affine(&ctx); + if (ret >= 0 && ret < nr_cpumask_bits) + return ret; + } +#endif + if (sched_feat(WA_IDLE)) target = wake_affine_idle(this_cpu, prev_cpu, sync);
@@ -6887,6 +6903,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); +#ifdef CONFIG_BPF_SCHED + struct sched_migrate_ctx ctx; + int ret; +#endif
time = schedstat_start_time();
@@ -6904,6 +6924,26 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f }
rcu_read_lock(); +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = cpu; + ctx.is_sync = sync; + ctx.wake_flags = wake_flags; + ctx.want_affine = want_affine; + ctx.sd_flag = sd_flag; + ctx.cpus_allowed = (void *)p->cpus_ptr; + ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask); + + ret = bpf_sched_cfs_select_rq(&ctx); + if (ret >= 0) { + rcu_read_unlock(); + return ret; + } + } +#endif + for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, @@ -6935,6 +6975,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (want_affine) current->recent_used_cpu = cpu; } + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.new_cpu = new_cpu; + ret = bpf_sched_cfs_select_rq_exit(&ctx); + if (ret >= 0) + new_cpu = ret; + } +#endif + rcu_read_unlock(); schedstat_end_time(cpu_rq(cpu), time);
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 2f9d2160b5fb..fd89d2f2a86d 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -445,6 +445,7 @@ class PrinterHelpers(Printer): 'struct bpf_cpumask_info', 'struct cpumask', 'struct cpumask_op_args', + 'struct sched_migrate_ctx', ] known_types = { '...', @@ -498,6 +499,7 @@ class PrinterHelpers(Printer): 'struct bpf_cpumask_info', 'struct cpumask', 'struct cpumask_op_args', + 'struct sched_migrate_ctx', } mapped_types = { 'u8': '__u8',
From: Hui Tang tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a helper function to set cpus_ptr in the task.
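A sketch of the intended pairing with the 'cfs_select_rq'/'cfs_select_rq_exit' hooks: narrow cpus_ptr during selection and restore it on exit. The "0-3" range and program names are assumptions; the samples in a later patch do the equivalent with a tag-indexed table:

/* Hypothetical sketch: narrow the task's cpus_ptr to a preferred range while
 * selecting a core and restore the original mask before returning.
 */
#include <bpf/libbpf_sched.h>

SEC("sched/cfs_select_rq")
int BPF_PROG(narrow_cpus_ptr, struct sched_migrate_ctx *h_ctx)
{
	struct cpumask *prefer = getVal(h_ctx->select_idle_mask); /* scratch mask */
	char range[] = "0-3";		/* assumption: preferred CPU range */

	if (libbpf_cpumask_cpulist_parse(range, prefer))
		return -1;		/* parse failed: leave affinity untouched */

	libbpf_sched_set_task_cpus_ptr(h_ctx, prefer);
	return -1;	/* keep the default selection, now limited to the range */
}

SEC("sched/cfs_select_rq_exit")
int BPF_PROG(restore_cpus_ptr, struct sched_migrate_ctx *h_ctx)
{
	/* always restore the original affinity before select_task_rq_fair() returns */
	libbpf_sched_set_task_cpus_ptr(h_ctx, (void *)getVal(h_ctx->cpus_allowed));
	return -1;	/* negative: do not override the chosen CPU */
}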
Signed-off-by: Hui Tang tanghui20@huawei.com --- include/uapi/linux/bpf.h | 7 +++++++ kernel/sched/bpf_sched.c | 23 +++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ tools/lib/bpf/libbpf_sched.h | 7 +++++++ 4 files changed, 44 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3bbce04f4c33..ab02f3d077e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3906,6 +3906,12 @@ union bpf_attr { * check src_cpu whether share cache with dst_cpu. * Return * yes 1, no 0. + * + * int bpf_sched_set_task_cpus_ptr(struct sched_migrate_ctx *h_ctx, struct cpumask *cpus, int len) + * Description + * set cpus_ptr in task. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4084,6 +4090,7 @@ union bpf_attr { FN(get_cpumask_info), \ FN(cpumask_op), \ FN(cpus_share_cache), \ + FN(sched_set_task_cpus_ptr), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 1584964c28d1..fc9c5e54f41f 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -500,6 +500,27 @@ static const struct bpf_func_proto bpf_cpus_share_cache_proto = { .arg2_type = ARG_ANYTHING, };
+BPF_CALL_3(bpf_sched_set_task_cpus_ptr, struct sched_migrate_ctx *, h_ctx, + struct cpumask *, cpus, int, len) +{ + if (len != sizeof(*cpus)) + return -EINVAL; + + h_ctx->task->cpus_ptr = cpus; + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_sched_migrate_ctx_ids, struct, sched_migrate_ctx) + +static const struct bpf_func_proto bpf_sched_set_task_cpus_ptr_proto = { + .func = bpf_sched_set_task_cpus_ptr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sched_migrate_ctx_ids[0], + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -534,6 +555,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_cpumask_op_proto; case BPF_FUNC_cpus_share_cache: return &bpf_cpus_share_cache_proto; + case BPF_FUNC_sched_set_task_cpus_ptr: + return &bpf_sched_set_task_cpus_ptr_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a035635f938d..d00288ea5ba1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3906,6 +3906,12 @@ union bpf_attr { * check src_cpu whether share cache with dst_cpu. * Return * true yes, false no. + * + * int bpf_sched_set_task_cpus_ptr(struct sched_migrate_ctx *h_ctx, struct cpumask *cpus, int len) + * Description + * set cpus_ptr in task. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4084,6 +4090,7 @@ union bpf_attr { FN(get_cpumask_info), \ FN(cpumask_op), \ FN(cpus_share_cache), \ + FN(sched_set_task_cpus_ptr), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h index 9a0e6b0653df..6cb30e8e81f8 100644 --- a/tools/lib/bpf/libbpf_sched.h +++ b/tools/lib/bpf/libbpf_sched.h @@ -463,4 +463,11 @@ static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se)
return se_tag; } + +static __always_inline void libbpf_sched_set_task_cpus_ptr( + struct sched_migrate_ctx *h_ctx, + struct cpumask *cpus) +{ + bpf_sched_set_task_cpus_ptr(h_ctx, cpus, sizeof(*cpus)); +} #endif
From: Hui Tang tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
The samples cover the following hooks: 1. 'cfs_select_rq' 2. 'cfs_wake_affine' 3. 'cfs_select_rq_exit'
Signed-off-by: Hui Tang tanghui20@huawei.com --- samples/bpf/Makefile | 3 + samples/bpf/sched_select_core_kern.c | 259 +++++++++++++++++++++++++++ samples/bpf/sched_select_core_user.c | 125 +++++++++++++ 3 files changed, 387 insertions(+) create mode 100644 samples/bpf/sched_select_core_kern.c create mode 100644 samples/bpf/sched_select_core_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index e473bad76549..62dadae992a2 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -55,6 +55,7 @@ tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm tprogs-y += sched_preempt +tprogs-y += sched_select_core
# Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -113,6 +114,7 @@ xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) sched_preempt-objs := sched_preempt_user.o +sched_select_core-objs := sched_select_core_user.o
# Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -175,6 +177,7 @@ always-y += hbm_out_kern.o always-y += hbm_edt_kern.o always-y += xdpsock_kern.o always-y += sched_preempt_kern.o +always-y += sched_select_core_kern.o
ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/sched_select_core_kern.c b/samples/bpf/sched_select_core_kern.c new file mode 100644 index 000000000000..18617e89b395 --- /dev/null +++ b/samples/bpf/sched_select_core_kern.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Sample select core BPF program. + * 'cfs_select_rq' + * Replace the original core selection policy or + * implement dynamic CPU affinity. + * + * 'cfs_select_rq_exit' + * Restoring the CPU affinity of the task before exiting of + * 'select_task_rq_fair'. + * + * To be used with 'cfs_select_rq' hook to implement + * dynamic CPU affinity. + * + * 'cfs_wake_affine' + * Determine on which CPU task can run soonest. Allow user to + * implement deferent policies. + */ +#include <linux/version.h> +#include <linux/sched.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/libbpf_sched.h> +#include <linux/cpumask.h> + +#define STR_MAX (32) +#define SELECT_RQ_RANGE (-1) +#define SELECT_RQ_EXIT_CPU_VALID (-2) + +/* From kernel/sched/sched.h */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ +#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ + +#define TAG_ID(id) TAG_##id + +enum tag_id { + TAG_NONE, + TAG_ID(1), + TAG_ID(2), + TAG_MAX +}; + +struct tag_info { + long tag; + char buf[STR_MAX]; +}; + +struct tag_info tag_tbl[] = { + {TAG_NONE, ""}, + {TAG_ID(1), "0-3"}, + {TAG_ID(2), "4-7"}, + {TAG_MAX, ""}, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} map_idlest_cpu SEC(".maps"); + +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct cpumask *prefer_cpus, + struct cpumask *cpus_allowed) +{ + return !libbpf_cpumask_empty(prefer_cpus) && + !libbpf_cpumask_equal(prefer_cpus, cpus_allowed) && + libbpf_cpumask_subset(prefer_cpus, cpus_allowed); +} + +static struct cpumask *select_better_cpus(struct task_struct *p, + struct cpumask *prefer_cpus, + int *idlest_cpu) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + unsigned int weight; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + + if (!prefer_cpus_valid(prefer_cpus, (void *)getVal(p->cpus_ptr))) + return (void *)getVal(p->cpus_ptr); + + tg = p->sched_task_group; + libbpf_for_each_cpu(cpu, prefer_cpus) { + if (idlest_cpu && libbpf_available_idle_cpu(cpu)) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(libbpf_capacity_of(cpu) - libbpf_cfs_util_avg_of(cpu)); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (libbpf_available_idle_cpu(cpu)) + return getVal(prefer_cpus); + + util_avg_sum += libbpf_cfs_util_avg_of(cpu); + tg_capacity += libbpf_capacity_of(cpu); 
+ } + + weight = libbpf_cpumask_weight(prefer_cpus); + if (tg_capacity > weight && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { + return getVal(prefer_cpus); + } + + return (void *)getVal(p->cpus_ptr); +} + +SEC("sched/cfs_select_rq") +int BPF_PROG(cfs_select_cpu_range, struct sched_migrate_ctx *h_ctx) +{ + struct cpumask *prefer_cpus = getVal(h_ctx->select_idle_mask); + struct task_struct *p = getVal(h_ctx->task); + struct cpumask *cpus_ptr; + int type = SELECT_RQ_RANGE; + long tag = getVal(p->tag); + int *idlest_cpu = 0; + int key = 0; + int ret; + + if (tag <= TAG_NONE || tag >= TAG_MAX) + return type; + + ret = libbpf_cpumask_cpulist_parse(tag_tbl[tag].buf, prefer_cpus); + if (ret) + return type; + + idlest_cpu = bpf_map_lookup_elem(&map_idlest_cpu, &key); + if (!idlest_cpu) + return type; + + cpus_ptr = select_better_cpus(p, prefer_cpus, idlest_cpu); + libbpf_sched_set_task_cpus_ptr((void *)h_ctx, getVal(cpus_ptr)); + + return type; +} + +SEC("sched/cfs_select_rq_exit") +int BPF_PROG(cfs_select_cpu_range_exit, struct sched_migrate_ctx *h_ctx) +{ + int *idlest_cpu; + int key = 0; + + idlest_cpu = bpf_map_lookup_elem(&map_idlest_cpu, &key); + if (!idlest_cpu) { + libbpf_sched_set_task_cpus_ptr(h_ctx, (void *)getVal(h_ctx->cpus_allowed)); + return SELECT_RQ_EXIT_CPU_VALID; + } + + if (!libbpf_cpumask_test_cpu(getVal(h_ctx->new_cpu), + (void *)getVal(h_ctx->task->cpus_ptr))) { + libbpf_sched_set_task_cpus_ptr(h_ctx, (void *)getVal(h_ctx->cpus_allowed)); + return *idlest_cpu; + } + + libbpf_sched_set_task_cpus_ptr(h_ctx, (void *)getVal(h_ctx->cpus_allowed)); + return SELECT_RQ_EXIT_CPU_VALID; +} + +static int find_idlest_cpu(struct task_struct *p, int parent) +{ + unsigned long min = INT_MAX; + int min_load_cpu = 0; + unsigned long load; + int cpu; + int i; + + for (i = 0, cpu = -1; i < NR_CPUS; i++) { + cpu = libbpf_cpumask_next(cpu, (void *)getVal(p->cpus_ptr)); + if (cpu >= libbpf_nr_cpus_ids()) + break; + + load = libbpf_cfs_load_avg_of(cpu); + if (load < min) { + min = load; + min_load_cpu = cpu; + } + } + + return min_load_cpu; +} + +static int select_idle_cpu(struct task_struct *p, int parent, int prev_cpu) +{ + int cpu; + + if (libbpf_available_idle_cpu(prev_cpu)) + return prev_cpu; + + if (libbpf_available_idle_cpu(parent)) + return prev_cpu; + + libbpf_for_each_cpu_wrap(cpu, (void *)getVal(p->cpus_ptr), prev_cpu) { + if (libbpf_available_idle_cpu(cpu)) + return cpu; + } + + return prev_cpu; +} + +SEC("sched/cfs_select_rq") +int BPF_PROG(cfs_select_cpu, struct sched_migrate_ctx *h_ctx) +{ + struct task_struct *p = getVal(h_ctx->task); + int wake_flags = getVal(h_ctx->wake_flags); + int prev_cpu = getVal(h_ctx->prev_cpu); + int cpu = getVal(h_ctx->curr_cpu); + int new_cpu; + + if (wake_flags == WF_FORK) { + /* Slow path */ + new_cpu = find_idlest_cpu(p, cpu); + } else { + /* Fast path */ + new_cpu = select_idle_cpu(p, cpu, prev_cpu); + } + + return new_cpu; +} + +SEC("sched/cfs_wake_affine") +int BPF_PROG(cfs_wake_affine, struct sched_affine_ctx *h_ctx) +{ + int prev_cpu = getVal(h_ctx->prev_cpu); + int curr_cpu = getVal(h_ctx->curr_cpu); + int sync = getVal(h_ctx->is_sync); + + if (libbpf_available_idle_cpu(curr_cpu) && + libbpf_cpus_share_cache(curr_cpu, prev_cpu)) + return libbpf_available_idle_cpu(prev_cpu) ? 
prev_cpu : curr_cpu; + + if (sync && libbpf_nr_running_of(curr_cpu) == 1) + return curr_cpu; + + return prev_cpu; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sched_select_core_user.c b/samples/bpf/sched_select_core_user.c new file mode 100644 index 000000000000..99c98f394478 --- /dev/null +++ b/samples/bpf/sched_select_core_user.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> + +static void usage(void) +{ + printf("USAGE: test sched select core [...]\n"); + printf(" -W wakeup affine # Test sched wake wakeup\n"); + printf(" -C select core # Test sched select core\n"); + printf(" -R select core range # Test sched select core range\n"); + printf(" -h # Display this help\n"); +} + +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +/* read trace logs from debug fs */ +static void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +int main(int argc, char **argv) +{ + int opt; + char filename[256]; + char progname[4][256]; + struct bpf_object *obj; + struct bpf_program *prog[4] = {NULL}; + struct bpf_link *link[4] = {NULL}; + int prog_num = 1; + int i = 0; + + while ((opt = getopt(argc, argv, "C::R::W::E::")) != -1) { + switch (opt) { + case 'C': + snprintf(progname[0], sizeof(progname[0]), "cfs_select_cpu"); + break; + case 'R': + snprintf(progname[0], sizeof(progname[0]), "cfs_select_cpu_range"); + snprintf(progname[1], sizeof(progname[1]), "cfs_select_cpu_range_exit"); + prog_num = 2; + break; + case 'W': + snprintf(progname[0], sizeof(progname[0]), "cfs_wake_affine"); + break; + default: + usage(); + goto out; + } + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + goto out; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (i = 0; i < prog_num; i++) { + prog[i] = bpf_object__find_program_by_name(obj, progname[i]); + if (libbpf_get_error(prog[i])) { + fprintf(stderr, "ERROR: finding a prog %d in obj file failed\n", i); + goto cleanup; + } + + link[i] = bpf_program__attach(prog[i]); + if (libbpf_get_error(link[i])) { + fprintf(stderr, "ERROR: bpf_program__attach %d failed\n", i); + link[i] = NULL; + goto cleanup; + } + } + + printf("select rq BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + for (; i >= 0; i--) + bpf_link__destroy(link[i]); + bpf_object__close(obj); +out: + return 0; +}
From: Guan Jing guanjing6@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
This hook point can change the position of a sched entity on the red-black tree. For example, in cloud scenarios there are online tasks that must respond in time and offline tasks that do not need to respond in time. This hook point gives users a way to customize which class of tasks runs first. The input for picking the next task comes from system information, such as the red-black tree, and so on… Modifying the CFS system information itself would affect the whole system, so the hook function is added here: only the position of the task on the red-black tree is changed, and the vruntime value is not modified.
Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched_hook_defs.h | 2 ++ kernel/sched/fair.c | 9 +++++++++ 2 files changed, 11 insertions(+)
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 4e359649db4b..07b3063d6f56 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -6,3 +6,5 @@ BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, struct sched_entity *curr, + struct sched_entity *next) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23206e6320f0..e9e054ba2760 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline int entity_before(struct sched_entity *a, struct sched_entity *b) { +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_tag_pick_next_entity(a, b); + + if (ret == 1) + return 1; + } +#endif + return (s64)(a->vruntime - b->vruntime) < 0; }
From: Guan Jing guanjing6@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
The sample bpf program sched_pick_task_kern.o sets the online task to be the next task to run when there are both online and offline tasks on the rq.
Signed-off-by: Guan Jing guanjing6@huawei.com --- samples/bpf/Makefile | 3 + samples/bpf/sched_pick_task_kern.c | 56 ++++++++++++++++++ samples/bpf/sched_pick_task_user.c | 94 ++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+) create mode 100644 samples/bpf/sched_pick_task_kern.c create mode 100644 samples/bpf/sched_pick_task_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 62dadae992a2..1d92e87565ad 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -56,6 +56,7 @@ tprogs-y += ibumad tprogs-y += hbm tprogs-y += sched_preempt tprogs-y += sched_select_core +tprogs-y += sched_pick_task
# Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -115,6 +116,7 @@ ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) sched_preempt-objs := sched_preempt_user.o sched_select_core-objs := sched_select_core_user.o +sched_pick_task-objs := sched_pick_task_user.o
# Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -178,6 +180,7 @@ always-y += hbm_edt_kern.o always-y += xdpsock_kern.o always-y += sched_preempt_kern.o always-y += sched_select_core_kern.o +always-y += sched_pick_task_kern.o
ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/sched_pick_task_kern.c b/samples/bpf/sched_pick_task_kern.c new file mode 100644 index 000000000000..b7a48abaf01a --- /dev/null +++ b/samples/bpf/sched_pick_task_kern.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/version.h> +#include <linux/sched.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/libbpf_sched.h> + +#define PICK_CURR 1 +#define PICK_NOMAL 0 +#define ERROR -1 + +enum task_type { + TASK_TYPE_OFFLINE = -1, + TASK_TYPE_ONLINE, + TASK_TYPE_MAX +}; + +/* + * Only implements the effect of the task selection strategy + * and needs to be used in conjunction with preempt and + * load balance. If quota is not configured, the priority + * inversion leads to system crash. + */ +SEC("sched/cfs_tag_pick_next_entity") +int BPF_PROG(sched_cfs_tag_pick_next_entity, struct sched_entity *curr, struct sched_entity *next) +{ + int curr_type = 0; + int next_type = 0; + + if (curr == NULL || next == NULL) + return PICK_NOMAL; + + curr_type = libbpf_sched_se_tag_of(curr); + next_type = libbpf_sched_se_tag_of(next); + + if (curr_type > next_type) + return PICK_CURR; + + return PICK_NOMAL; +} + +char _license[] SEC("license") = "GPL"; + diff --git a/samples/bpf/sched_pick_task_user.c b/samples/bpf/sched_pick_task_user.c new file mode 100644 index 000000000000..0c8a24393bd2 --- /dev/null +++ b/samples/bpf/sched_pick_task_user.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> + +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +/* read trace logs from debug fs */ +void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +int main(int argc, char **argv) +{ + char filename[256]; + struct bpf_object *obj; + struct bpf_program *prog; + struct bpf_link *link; + int err; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + /* Open BPF application */ + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 1; + } + + /* Load and verify BPF program */ + err = bpf_object__load(obj); + if (err) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "sched_cfs_tag_pick_next_entity"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + link = NULL; + goto cleanup; + } + + printf("preempt BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); +out: + return 0; +}