Add the programmable scheduler feature for openEuler-22.09.
Chen Hui (12):
  sched: programmable: Add a tag for the task group
  sched: programmable: Add user interface of task group tag
  sched: programmable: Add a tag for the task
  sched: programmable: Add user interface of task tag
  sched: programmable: add bpf_sched_task_tag_of helper function
  sched: programmable: Add convenient helper functions to convert sched entity
  bpf: BPF samples support SCHED program type
  samples: bpf: Add bpf sched preempt sample program
  bpf:programmable: Add four helper functions to get cpu stat
  bpf:programmable: Add cpumask ops collection
  sched: programmable: Add lib for sched programmable
  sched: programmable: Add hook for select_task_rq_fair
Guan Jing (2):
  sched: programmable: Add hook for entity_before
  samples: bpf: Add bpf sched pick task sample
Hui Tang (3):
  bpf:programmable: Add helper func to check cpu share cache
  bpf:programmable: Add helper func to set cpus_ptr in task
  samples:bpf: Add samples for cfs select core
Ren Zhijie (3):
  sched: programmable: add bpf_sched_tg_tag_of helper function
  sched: programmable: Add helpers to set tag of task or task_group
  sched: programmable: Add helper function for cpu topology.
 fs/proc/base.c                       |  65 ++++
 include/linux/bpf_topology.h         |  46 +++
 include/linux/sched.h                |  85 ++++++
 include/linux/sched_hook_defs.h      |   7 +-
 include/uapi/linux/bpf.h             | 112 +++++++
 init/init_task.c                     |   3 +
 kernel/bpf/helpers.c                 |  12 +
 kernel/bpf/verifier.c                |   4 +-
 kernel/sched/Makefile                |   3 +-
 kernel/sched/bpf_sched.c             | 409 +++++++++++++++++++++++++
 kernel/sched/bpf_topology.c          |  99 ++++++
 kernel/sched/core.c                  | 103 +++++++
 kernel/sched/fair.c                  |  58 ++++
 kernel/sched/sched.h                 |   8 +
 samples/bpf/Makefile                 |   9 +
 samples/bpf/bpf_load.c               |   8 +-
 samples/bpf/sched_pick_task_kern.c   |  62 ++++
 samples/bpf/sched_pick_task_user.c   |  92 ++++++
 samples/bpf/sched_preempt_kern.c     | 151 ++++++++++
 samples/bpf/sched_preempt_user.c     | 139 +++++++++
 samples/bpf/sched_select_core_kern.c | 239 +++++++++++++++
 samples/bpf/sched_select_core_user.c | 114 +++++++
 scripts/bpf_helpers_doc.py           |  20 ++
 tools/include/uapi/linux/bpf.h       | 112 +++++++
 tools/lib/bpf/libbpf_sched.h         | 435 +++++++++++++++++++++++++++
 25 files changed, 2389 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/bpf_topology.h
 create mode 100644 kernel/sched/bpf_topology.c
 create mode 100644 samples/bpf/sched_pick_task_kern.c
 create mode 100644 samples/bpf/sched_pick_task_user.c
 create mode 100644 samples/bpf/sched_preempt_kern.c
 create mode 100644 samples/bpf/sched_preempt_user.c
 create mode 100644 samples/bpf/sched_select_core_kern.c
 create mode 100644 samples/bpf/sched_select_core_user.c
 create mode 100644 tools/lib/bpf/libbpf_sched.h
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Add a tag for the task group, to support the tag-based scheduling mechanism.
The tag is used to identify a special task or a type of special task. There are many such tasks in the real world, for example foreground and background tasks, or online and offline tasks. With the tag we can identify these tasks and apply specific scheduling policies to them.
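For illustration only (not part of this patch): the sample programs later in this series use a small set of tag values along these lines, with untagged tasks (tag == 0) treated as normal:

  enum task_type {
          TASK_TYPE_OFFLINE = -1,  /* background / best-effort work */
          TASK_TYPE_NORMAL  = 0,   /* default: task has no special tag */
          TASK_TYPE_ONLINE  = 1,   /* latency-sensitive foreground work */
  };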
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- kernel/sched/core.c | 19 +++++++++++++++++++ kernel/sched/sched.h | 5 +++++ 2 files changed, 24 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51c707897c8d..c53c032a378a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7755,6 +7755,13 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); }
+#ifdef CONFIG_BPF_SCHED +static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg) +{ + tg->tag = ptg->tag; +} +#endif + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -7775,6 +7782,10 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err;
+#ifdef CONFIG_BPF_SCHED + tg_init_tag(tg, parent); +#endif + alloc_uclamp_sched_group(tg, parent);
return tg; @@ -7846,6 +7857,14 @@ static void sched_change_group(struct task_struct *tsk, int type) sched_change_qos_group(tsk, tg); #endif
+#ifdef CONFIG_BPF_SCHED + /* + * This function has cleared and restored the task status, + * so we do not need to dequeue and enqueue the task again. + */ + tsk->tag = tg->tag; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk, type); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 42d5fb7d9464..67fd5cbbefea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -455,6 +455,11 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; #endif
+#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a group */ + long tag; +#endif + KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3)
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Add a user interface for the task group tag, bridging the information gap between user mode and kernel mode.
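For illustration, a minimal user-space sketch (not part of this patch) that tags every task in an existing cpu cgroup through the new cpu.tag cftype; the v1 mount point /sys/fs/cgroup/cpu and the group name are assumptions:

  #include <stdio.h>

  /* Writes the cgroup's cpu.tag file; the kernel side then propagates the
   * tag to the group's descendants and to all member tasks.
   */
  static int set_group_tag(const char *group, long tag)
  {
          char path[256];
          FILE *fp;

          snprintf(path, sizeof(path), "/sys/fs/cgroup/cpu/%s/cpu.tag", group);
          fp = fopen(path, "w");
          if (!fp)
                  return -1;
          fprintf(fp, "%ld\n", tag);
          fclose(fp);
          return 0;
  }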
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c53c032a378a..2256ac22d26d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8637,6 +8637,80 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, } #endif
+#ifdef CONFIG_BPF_SCHED +static void sched_settag(struct task_struct *tsk, s64 tag) +{ + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq_flags rf; + struct rq *rq; + + if (tsk->tag == tag) + return; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + update_rq_clock(rq); + if (queued) + dequeue_task(rq, tsk, queue_flags); + if (running) + put_prev_task(rq, tsk); + + tsk->tag = tag; + + if (queued) + enqueue_task(rq, tsk, queue_flags); + if (running) + set_next_task(rq, tsk); + + task_rq_unlock(rq, tsk, &rf); +} + +static int tg_change_tag(struct task_group *tg, void *data) +{ + struct css_task_iter it; + struct task_struct *tsk; + s64 tag = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->tag = tag; + + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_settag(tsk, tag); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_tag_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 tag) +{ + struct task_group *tg = css_tg(css); + + if (tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +} + +static inline s64 cpu_tag_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->tag; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -8698,6 +8772,13 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_qos_read, .write_s64 = cpu_qos_write, }, +#endif +#ifdef CONFIG_BPF_SCHED + { + .name = "tag", + .read_s64 = cpu_tag_read, + .write_s64 = cpu_tag_write, + }, #endif { } /* Terminate */ };
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Add a tag for the task, in the same way as for the task group.
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/linux/sched.h | 5 +++++ init/init_task.c | 3 +++ kernel/sched/core.c | 3 +++ 3 files changed, 11 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 47f462040f4d..08803b8664b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1397,6 +1397,11 @@ struct task_struct { */ randomized_struct_fields_end
+#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a task */ + long tag; +#endif + KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) diff --git a/init/init_task.c b/init/init_task.c index 5fa18ed59d33..7003426df677 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -213,6 +213,9 @@ struct task_struct init_task #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_BPF_SCHED + .tag = 0, +#endif }; EXPORT_SYMBOL(init_task);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2256ac22d26d..8f385aa1c201 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3113,6 +3113,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif +#ifdef CONFIG_BPF_SCHED + p->tag = 0; +#endif }
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Add a user interface for the task tag, bridging the information gap between user mode and kernel mode.
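For illustration, a minimal user-space sketch (not part of this patch). The "tag" file is registered in tid_base_stuff, i.e. under /proc/<pid>/task/<tid>/; the function name is an assumption:

  #include <stdio.h>
  #include <sys/types.h>

  /* Writes the per-thread tag file; the kernel parses the value with
   * kstrtoint() and applies it via sched_settag().
   */
  static int set_thread_tag(pid_t pid, pid_t tid, int tag)
  {
          char path[64];
          FILE *fp;

          snprintf(path, sizeof(path), "/proc/%d/task/%d/tag", (int)pid, (int)tid);
          fp = fopen(path, "w");
          if (!fp)
                  return -1;
          fprintf(fp, "%d\n", tag);
          fclose(fp);
          return 0;
  }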
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- fs/proc/base.c | 65 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 4 +++ kernel/sched/core.c | 2 +- 3 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index b9052be86e8d..8ae7c2be70c2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3644,6 +3644,68 @@ static const struct inode_operations proc_tid_comm_inode_operations = { .permission = proc_tid_comm_permission, };
+#ifdef CONFIG_BPF_SCHED +static ssize_t pid_tag_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *tsk; + char buffer[PROC_NUMBUF]; + int err = 0, tag = 0; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &tag); + if (err) + goto out; + + sched_settag(tsk, tag); + +out: + put_task_struct(tsk); + return err < 0 ? err : count; +} + +static int pid_tag_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *tsk; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + seq_printf(m, "%ld\n", tsk->tag); + put_task_struct(tsk); + + return 0; +} + +static int pid_tag_open(struct inode *inode, struct file *flip) +{ + return single_open(flip, pid_tag_show, inode); +} + +static const struct file_operations proc_pid_tag_operations = { + .open = pid_tag_open, + .read = seq_read, + .write = pid_tag_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * Tasks */ @@ -3751,6 +3813,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_ASCEND_SHARE_POOL ONE("sp_group", 0444, proc_sp_group_state), #endif +#ifdef CONFIG_BPF_SCHED + REG("tag", 0644, proc_pid_tag_operations), +#endif };
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/sched.h b/include/linux/sched.h index 08803b8664b3..4981139e42e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2181,4 +2181,8 @@ static inline int sched_qos_cpu_overload(void) return 0; } #endif + +#ifdef CONFIG_BPF_SCHED +extern void sched_settag(struct task_struct *tsk, s64 tag); +#endif #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f385aa1c201..1eb5f5e03295 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8641,7 +8641,7 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, #endif
#ifdef CONFIG_BPF_SCHED -static void sched_settag(struct task_struct *tsk, s64 tag) +void sched_settag(struct task_struct *tsk, s64 tag) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
From: Ren Zhijie <renzhijie2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
This helper function reads the tag of a task group (struct task_group).
Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Chen Hui judy.chenhui@huawei.com --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/verifier.c | 4 ++-- kernel/sched/bpf_sched.c | 23 +++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 7 +++++++ 5 files changed, 41 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d5fbbc28b6a0..0bff54992b85 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3777,6 +3777,12 @@ union bpf_attr { * to be enabled. * Return * 1 if the sched entity belongs to a cgroup, 0 otherwise. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3940,6 +3946,7 @@ union bpf_attr { FN(sched_entity_to_tgidpid), \ FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ + FN(sched_tg_tag_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d26104b258ba..7acc2cd0081f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5020,10 +5020,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false;
- if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; }
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 2ce2afcacb17..44a2ae6be1ec 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -112,6 +112,27 @@ static const struct bpf_func_proto bpf_sched_entity_belongs_to_cgrp_proto = { .arg2_type = ARG_ANYTHING, };
+BPF_CALL_1(bpf_sched_tg_tag_of, struct task_group *, tg) +{ + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + ret = tg->tag; +#endif + + return ret; +} + +BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group) + +static const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { + .func = bpf_sched_tg_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -124,6 +145,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_entity_to_cgrpid_proto; case BPF_FUNC_sched_entity_belongs_to_cgrp: return &bpf_sched_entity_belongs_to_cgrp_proto; + case BPF_FUNC_sched_tg_tag_of: + return &bpf_sched_tg_tag_of_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index be21512ee7be..f8a778ac9ce1 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -436,6 +436,7 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct sched_entity', + 'struct task_group', ] known_types = { '...', @@ -480,6 +481,7 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct sched_entity', + 'struct task_group', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b2a0b189b797..aed46483130e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3777,6 +3777,12 @@ union bpf_attr { * to be enabled. * Return * 1 if the sched entity belongs to a cgroup, 0 otherwise. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3940,6 +3946,7 @@ union bpf_attr { FN(sched_entity_to_tgidpid), \ FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ + FN(sched_tg_tag_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
This helper function reads the tag of a task (struct task_struct).
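For illustration, a minimal BPF sketch (modelled on the preempt sample added later in this series) that uses the new helper in the wakeup-preemption hook; the program and variable names are assumptions:

  #include <linux/sched.h>
  #include <uapi/linux/bpf.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  SEC("sched/cfs_check_preempt_wakeup")
  int BPF_PROG(check_preempt_wakeup, struct task_struct *curr, struct task_struct *p)
  {
          long curr_tag = bpf_sched_task_tag_of(curr);
          long p_tag = bpf_sched_task_tag_of(p);

          if (curr_tag > p_tag)           /* current task is "more online" */
                  return -1;              /* do not preempt */
          if (curr_tag < p_tag)           /* waking task is "more online" */
                  return 1;               /* preempt */
          return 0;                       /* keep the default CFS decision */
  }

  char _license[] SEC("license") = "GPL";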
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/uapi/linux/bpf.h | 7 +++++++ kernel/sched/bpf_sched.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 3 files changed, 31 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0bff54992b85..7297c6e4f882 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3783,6 +3783,12 @@ union bpf_attr { * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. * Return * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 otherwise. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*. + * Return + * Task tag, if used, 0 otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3947,6 +3953,7 @@ union bpf_attr { FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ FN(sched_tg_tag_of), \ + FN(sched_task_tag_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 44a2ae6be1ec..cea13d36272f 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -133,6 +133,21 @@ static const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { .arg1_btf_id = &btf_sched_tg_ids[0], };
+BPF_CALL_1(bpf_sched_task_tag_of, struct task_struct *, tsk) +{ + return tsk->tag; +} + +BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct) + +static const struct bpf_func_proto bpf_sched_task_tag_of_proto = { + .func = bpf_sched_task_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -147,6 +162,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_entity_belongs_to_cgrp_proto; case BPF_FUNC_sched_tg_tag_of: return &bpf_sched_tg_tag_of_proto; + case BPF_FUNC_sched_task_tag_of: + return &bpf_sched_task_tag_of_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index aed46483130e..874b7b66451b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3783,6 +3783,12 @@ union bpf_attr { * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. * Return * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 otherwise. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*. + * Return + * Task tag, if used, 0 otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3947,6 +3953,7 @@ union bpf_attr { FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ FN(sched_tg_tag_of), \ + FN(sched_task_tag_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Add three helper functions:
1) bpf_sched_entity_is_task() checks whether a sched entity is a task.
2) bpf_sched_entity_to_task() converts a sched entity to its task_struct.
3) bpf_sched_entity_to_tg() converts a sched entity to its task_group.
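An illustrative sketch (modelled on the preempt-tick sample later in this series, assuming the same includes as a normal sched BPF sample): it resolves a sched entity to either its task or its task group before reading the tag:

  SEC("sched/cfs_check_preempt_tick")
  int BPF_PROG(check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec)
  {
          long tag = 0;

          if (bpf_sched_entity_is_task(curr)) {
                  struct task_struct *tsk = bpf_sched_entity_to_task(curr);

                  if (tsk)
                          tag = bpf_sched_task_tag_of(tsk);
          } else {
                  struct task_group *tg = bpf_sched_entity_to_tg(curr);

                  if (tg)
                          tag = bpf_sched_tg_tag_of(tg);
          }

          /* Only positively tagged ("online") entities ask for a resched. */
          return tag > 0 ? 1 : 0;
  }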
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/uapi/linux/bpf.h | 21 ++++++++++++ kernel/sched/bpf_sched.c | 61 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 21 ++++++++++++ 3 files changed, 103 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7297c6e4f882..59c94dd320a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3789,6 +3789,24 @@ union bpf_attr { * Return task tag of *tsk*. * Return * Task tag, if used, 0 otherwise. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3954,6 +3972,9 @@ union bpf_attr { FN(sched_entity_belongs_to_cgrp), \ FN(sched_tg_tag_of), \ FN(sched_task_tag_of), \ + FN(sched_entity_is_task), \ + FN(sched_entity_to_task), \ + FN(sched_entity_to_tg), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index cea13d36272f..d95dea1e3a66 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -148,6 +148,61 @@ static const struct bpf_func_proto bpf_sched_task_tag_of_proto = { .arg1_btf_id = &btf_sched_task_ids[0], };
+BPF_CALL_1(bpf_sched_entity_is_task, struct sched_entity *, se) +{ + return entity_is_task(se) ? 1 : 0; +} + +static const struct bpf_func_proto bpf_sched_entity_is_task_proto = { + .func = bpf_sched_entity_is_task, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_task, struct sched_entity *, se) +{ + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + return (unsigned long)tsk; + } + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_task_proto = { + .func = bpf_sched_entity_to_task, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_task_ids[0], + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_tg, struct sched_entity *, se) +{ +#if CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) { + struct task_group *tg = group_cfs_rq(se)->tg; + + return (unsigned long)tg; + } +#endif + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = { + .func = bpf_sched_entity_to_tg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_tg_ids[0], + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -164,6 +219,12 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_tg_tag_of_proto; case BPF_FUNC_sched_task_tag_of: return &bpf_sched_task_tag_of_proto; + case BPF_FUNC_sched_entity_is_task: + return &bpf_sched_entity_is_task_proto; + case BPF_FUNC_sched_entity_to_task: + return &bpf_sched_entity_to_task_proto; + case BPF_FUNC_sched_entity_to_tg: + return &bpf_sched_entity_to_tg_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 874b7b66451b..7aede7f62f5b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3789,6 +3789,24 @@ union bpf_attr { * Return task tag of *tsk*. * Return * Task tag, if used, 0 otherwise. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3954,6 +3972,9 @@ union bpf_attr { FN(sched_entity_belongs_to_cgrp), \ FN(sched_tg_tag_of), \ FN(sched_task_tag_of), \ + FN(sched_entity_is_task), \ + FN(sched_entity_to_task), \ + FN(sched_entity_to_tg), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Make the BPF samples support the SCHED program type: the sample loader (samples/bpf/bpf_load.c) loads sections named "sched/*" as BPF_PROG_TYPE_SCHED.
Signed-off-by: Chen Hui judy.chenhui@huawei.com --- samples/bpf/bpf_load.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index c5ad528f046e..4dfb45d254b1 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -87,6 +87,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_sockops = strncmp(event, "sockops", 7) == 0; bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; + bool is_sched = strncmp(event, "sched/", 6) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -120,6 +121,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SK_SKB; } else if (is_sk_msg) { prog_type = BPF_PROG_TYPE_SK_MSG; + } else if (is_sched) { + prog_type = BPF_PROG_TYPE_SCHED; } else { printf("Unknown event '%s'\n", event); return -1; @@ -137,7 +140,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd;
- if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) + if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk || is_sched) return 0;
if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { @@ -643,7 +646,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "cgroup/", 7) == 0 || memcmp(shname, "sockops", 7) == 0 || memcmp(shname, "sk_skb", 6) == 0 || - memcmp(shname, "sk_msg", 6) == 0) { + memcmp(shname, "sk_msg", 6) == 0 || + memcmp(shname, "sched/", 6) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0)
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
The sample selects, via the [-W|-T|-E] options, which BPF_SCHED programs to attach to the preemption hooks; the attached programs then match tasks or task groups tagged TASK_TYPE_ONLINE or TASK_TYPE_OFFLINE.

Each program returns a value to its hook that indicates whether to preempt the current sched entity.
To run,
# sched_preempt [-W|-T|-E|-h]
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- samples/bpf/Makefile | 3 + samples/bpf/sched_preempt_kern.c | 151 +++++++++++++++++++++++++++++++ samples/bpf/sched_preempt_user.c | 139 ++++++++++++++++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 samples/bpf/sched_preempt_kern.c create mode 100644 samples/bpf/sched_preempt_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index aeebf5d12f32..e473bad76549 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -54,6 +54,7 @@ tprogs-y += task_fd_query tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += sched_preempt
# Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -111,6 +112,7 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) +sched_preempt-objs := sched_preempt_user.o
# Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -172,6 +174,7 @@ always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o always-y += xdpsock_kern.o +always-y += sched_preempt_kern.o
ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/sched_preempt_kern.c b/samples/bpf/sched_preempt_kern.c new file mode 100644 index 000000000000..1e315af9e009 --- /dev/null +++ b/samples/bpf/sched_preempt_kern.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/version.h> +#include <linux/sched.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +unsigned long idea_runtime = 1000000UL; + +enum task_type { + TASK_TYPE_OFFLINE = -1, + TASK_TYPE_NORMAL, + TASK_TYPE_ONLINE, +}; + +#define getVal(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +#define bprintk(fmt, ...) \ + ({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ + }) + +SEC("sched/cfs_check_preempt_wakeup") +int BPF_PROG(sched_cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) +{ + long curr_type, p_type; + int ret = 0; + + curr_type = bpf_sched_task_tag_of(curr); + p_type = bpf_sched_task_tag_of(p); + + if (curr_type == TASK_TYPE_ONLINE && p_type == TASK_TYPE_OFFLINE) + ret = -1; + + if (curr_type == TASK_TYPE_OFFLINE && p_type == TASK_TYPE_ONLINE) + ret = 1; + + bprintk("check_preempt_wakeup: curr id = %d, p id = %d, preempt result is %d\n", + getVal(curr->pid), getVal(p->pid), ret); + + return ret; +} + +SEC("sched/cfs_check_preempt_tick") +int BPF_PROG(sched_cfs_check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec) +{ + long curr_type = TASK_TYPE_NORMAL; + int ret = 0, id = 0; + + if (bpf_sched_entity_is_task(curr)) { + struct task_struct *tsk = bpf_sched_entity_to_task(curr); + + if (tsk) { + curr_type = bpf_sched_task_tag_of(tsk); + id = getVal(tsk->pid); + } + bprintk("check_preempt_tick: delta = %lu,entity is task, id = %d\n", + delta_exec, id); + } else { + struct task_group *tg = bpf_sched_entity_to_tg(curr); + + if (tg) { + curr_type = bpf_sched_tg_tag_of(tg); + id = bpf_sched_entity_to_cgrpid(curr); + } + bprintk("check_preempt_tick: delta = %lu,entity is group, id = %d\n", + delta_exec, id); + } + + if (curr_type == TASK_TYPE_ONLINE) + ret = delta_exec >= idea_runtime ? 
1 : -1; + + bprintk("check_preempt_tick: preempt result = %d\n", ret); + return ret; +} + +SEC("sched/cfs_wakeup_preempt_entity") +int BPF_PROG(sched_cfs_wakeup_preempt_entity, struct sched_entity *curr, struct sched_entity *se) +{ + long curr_type = TASK_TYPE_NORMAL; + long p_type = TASK_TYPE_NORMAL; + int curr_id = 0, p_id = 0; + int ret = 0; + + if (bpf_sched_entity_is_task(curr)) { + struct task_struct *tsk = bpf_sched_entity_to_task(curr); + + if (tsk) { + curr_type = bpf_sched_task_tag_of(tsk); + curr_id = getVal(tsk->pid); + bprintk("wakeup_preempt_entity: curr entity is task, id = %d\n", curr_id); + } + } else { + struct task_group *tg = bpf_sched_entity_to_tg(curr); + + if (tg) { + curr_type = bpf_sched_tg_tag_of(tg); + curr_id = bpf_sched_entity_to_cgrpid(curr); + bprintk("wakeup_preempt_entity: curr entity is group, id = %d\n", curr_id); + } + } + + if (bpf_sched_entity_is_task(se)) { + struct task_struct *p = bpf_sched_entity_to_task(se); + + if (p) { + p_type = bpf_sched_task_tag_of(p); + p_id = getVal(p->pid); + bprintk("wakeup_preempt_entity: se entity is task, id = %d\n", p_id); + } + } else { + struct task_group *tg1 = bpf_sched_entity_to_tg(se); + + if (tg1) { + p_type = bpf_sched_tg_tag_of(tg1); + p_id = bpf_sched_entity_to_cgrpid(se); + bprintk("wakeup_preempt_entity: se entity is group, id = %d\n", p_id); + } + } + + if (curr_type == TASK_TYPE_ONLINE && p_type == TASK_TYPE_OFFLINE) + ret = -1; + + if (curr_type == TASK_TYPE_OFFLINE && p_type == TASK_TYPE_ONLINE) + ret = 1; + + bprintk("wakeup_preempt_entity: preempt result = %d\n", ret); + + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sched_preempt_user.c b/samples/bpf/sched_preempt_user.c new file mode 100644 index 000000000000..26f1f36d4558 --- /dev/null +++ b/samples/bpf/sched_preempt_user.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> + +#define MAX_PROGS (3) +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +int progindex[MAX_PROGS]; + +static void usage(void) +{ + printf("USAGE: sched_preempt [...]\n"); + printf(" -W # Test sched preempt wakeup\n"); + printf(" -T # Test sched preempt tick\n"); + printf(" -E # Test wakeup preempt entity\n"); + printf(" -h # Display this help\n"); +} + +/* read trace logs from debug fs */ +static void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +static inline bool check_attach_prog(int index) +{ + return progindex[index] ? 
true : false; +} + +int main(int argc, char **argv) +{ + int opt; + int index; + char filename[256]; + struct bpf_object *obj; + struct bpf_program *prog; + struct bpf_link *link = NULL; + + char prognames[MAX_PROGS][256] = { + "sched_cfs_check_preempt_wakeup", + "sched_cfs_check_preempt_tick", + "sched_cfs_wakeup_preempt_entity", + }; + + while ((opt = getopt(argc, argv, "WTEh")) != -1) { + switch (opt) { + case 'W': + progindex[0] = 1; + break; + case 'T': + progindex[1] = 1; + break; + case 'E': + progindex[2] = 1; + break; + case 'h': + default: + usage(); + goto out; + } + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + goto out; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (index = 0; index < MAX_PROGS; ++index) { + if (check_attach_prog(index)) { + prog = bpf_object__find_program_by_name(obj, prognames[index]); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog:%s in obj file failed\n", + prognames[index]); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + } + } + + printf("preempt BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + +out: + return 0; +}
From: Ren Zhijie <renzhijie2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Add the helper functions bpf_sched_set_tg_tag() and bpf_sched_set_task_tag() to set the tag of a task group or a task.
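For illustration only, a hedged sketch that copies a parent's tag to a newly forked child. The attach point (the sched_process_fork tracepoint), the vmlinux.h-style type information and the assumption that the runqueue lock is not held there are not part of this patch; note that bpf_sched_set_task_tag() ends up in sched_settag(), which takes task_rq_lock(), so it must not be called where that lock is already held:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  SEC("tp_btf/sched_process_fork")
  int BPF_PROG(tag_child, struct task_struct *parent, struct task_struct *child)
  {
          /* __sched_fork() zeroes the child's tag, so copy the parent's. */
          if (parent->tag)
                  bpf_sched_set_task_tag(child, parent->tag);

          return 0;
  }

  char _license[] SEC("license") = "GPL";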
Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/uapi/linux/bpf.h | 14 ++++++++++++ kernel/bpf/helpers.c | 6 +++++ kernel/sched/bpf_sched.c | 41 ++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 2 +- kernel/sched/sched.h | 3 +++ tools/include/uapi/linux/bpf.h | 14 ++++++++++++ 6 files changed, 79 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59c94dd320a2..091877cea33e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3807,6 +3807,18 @@ union bpf_attr { * Return task group of *se* if se is a task group. * Return * Task struct if se is a task group, NULL otherwise. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3975,6 +3987,8 @@ union bpf_attr { FN(sched_entity_is_task), \ FN(sched_entity_to_task), \ FN(sched_entity_to_tg), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4bb5921a7d21..41c273db0ca8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -658,6 +658,8 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_sched_set_tg_tag_proto __weak; +const struct bpf_func_proto bpf_sched_set_task_tag_proto __weak;
const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -715,6 +717,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_sched_set_tg_tag: + return &bpf_sched_set_tg_tag_proto; + case BPF_FUNC_sched_set_task_tag: + return &bpf_sched_set_task_tag_proto; default: break; } diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index d95dea1e3a66..50218064d74a 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -203,6 +203,47 @@ static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = { .arg1_btf_id = &btf_sched_entity_ids[0], };
+BPF_CALL_2(bpf_sched_set_tg_tag, struct task_group *, tg, s64, tag) +{ +#if CONFIG_CGROUP_SCHED + if (tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +#endif + return -EPERM; +} + +const struct bpf_func_proto bpf_sched_set_tg_tag_proto = { + .func = bpf_sched_set_tg_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_sched_set_task_tag, struct task_struct *, tsk, s64, tag) +{ + sched_settag(tsk, tag); + return 0; +} + +const struct bpf_func_proto bpf_sched_set_task_tag_proto = { + .func = bpf_sched_set_task_tag, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], + .arg2_type = ARG_ANYTHING, +}; static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1eb5f5e03295..c0fd31446c70 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8672,7 +8672,7 @@ void sched_settag(struct task_struct *tsk, s64 tag) task_rq_unlock(rq, tsk, &rf); }
-static int tg_change_tag(struct task_group *tg, void *data) +int tg_change_tag(struct task_group *tg, void *data) { struct css_task_iter it; struct task_struct *tsk; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 67fd5cbbefea..0423f7d422b1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -498,6 +498,9 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) }
extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_BPF_SCHED +extern int tg_change_tag(struct task_group *tg, void *data); +#endif
extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7aede7f62f5b..a751da128857 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3807,6 +3807,18 @@ union bpf_attr { * Return task group of *se* if se is a task group. * Return * Task struct if se is a task group, NULL otherwise. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3975,6 +3987,8 @@ union bpf_attr { FN(sched_entity_is_task), \ FN(sched_entity_to_task), \ FN(sched_entity_to_tg), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui <judy.chenhui@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Add four helper functions to get CPU statistics: bpf_sched_cpu_load_of(), bpf_sched_cpu_nr_running_of(), bpf_sched_cpu_idle_stat_of() and bpf_sched_cpu_capacity_of().
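For illustration, a hedged helper routine a sched BPF program could call (the struct definitions come from the kernel headers added here, e.g. via the sample includes or vmlinux.h; the idleness test itself is an assumption, not part of this patch):

  static __always_inline int cpu_looks_idle(int cpu)
  {
          struct bpf_sched_cpu_nr_running nr = {};
          struct bpf_sched_cpu_idle_stat idle = {};

          if (bpf_sched_cpu_nr_running_of(cpu, &nr, sizeof(nr)))
                  return 0;
          if (bpf_sched_cpu_idle_stat_of(cpu, &idle, sizeof(idle)))
                  return 0;

          /* Treat the CPU as idle if the kernel says so or no CFS task runs. */
          return idle.available_idle || nr.cfs_nr_running == 0;
  }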
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/linux/sched.h | 33 ++++++++ include/uapi/linux/bpf.h | 28 +++++++ kernel/sched/bpf_sched.c | 139 +++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 8 ++ tools/include/uapi/linux/bpf.h | 28 +++++++ 5 files changed, 236 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 4981139e42e4..6796437eec45 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2184,5 +2184,38 @@ static inline int sched_qos_cpu_overload(void)
#ifdef CONFIG_BPF_SCHED extern void sched_settag(struct task_struct *tsk, s64 tag); + +struct bpf_sched_cpu_load { + unsigned long cfs_load_avg; + unsigned long cfs_runnable_avg; + unsigned long cfs_util_avg; + unsigned long rt_load_avg; + unsigned long rt_runnable_avg; + unsigned long rt_util_avg; + unsigned long irq_load_avg; + unsigned long irq_runnable_avg; + unsigned long irq_util_avg; +}; + +struct bpf_sched_cpu_nr_running { + unsigned int nr_running; + unsigned int cfs_nr_running; + unsigned int cfs_h_nr_runing; + unsigned int cfs_idle_h_nr_running; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +}; + +struct bpf_sched_cpu_idle_stat { + int available_idle; + unsigned int exit_latency; + unsigned long idle_stamp; + unsigned long avg_idle; +}; + +struct bpf_sched_cpu_capacity { + unsigned long capacity; + unsigned long capacity_orig; +}; #endif #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 091877cea33e..c53cb91f67d1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3819,6 +3819,30 @@ union bpf_attr { * Set tag to *tsk*. * Return * Nothing. Always succeeds. + * + * int bpf_sched_cpu_load_of(int cpu, struct bpf_sched_cpu_load *ctx, int len) + * Description + * Get *cpu* load returned in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_nr_running_of(int cpu, struct bpf_sched_cpu_nr_running *ctx, int len) + * Description + * Get *cpu* nr running returned in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_idle_stat_of(int cpu, struct bpf_sched_cpu_idle_stat *ctx, int len) + * Description + * Get *cpu* idle state returned in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_capacity_of(int cpu, struct bpf_sched_cpu_capacity *ctx, int len) + * Description + * Get *cpu* capacity returned in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3989,6 +4013,10 @@ union bpf_attr { FN(sched_entity_to_tg), \ FN(sched_set_tg_tag), \ FN(sched_set_task_tag), \ + FN(sched_cpu_load_of), \ + FN(sched_cpu_nr_running_of), \ + FN(sched_cpu_idle_stat_of), \ + FN(sched_cpu_capacity_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 50218064d74a..cc63638b5e81 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -244,6 +244,137 @@ const struct bpf_func_proto bpf_sched_set_task_tag_proto = { .arg1_btf_id = &btf_sched_task_ids[0], .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_3(bpf_sched_cpu_load_of, int, cpu, + struct bpf_sched_cpu_load *, ctx, + int, len) +{ + struct rq *rq = cpu_rq(cpu); + + if (len != sizeof(*ctx)) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_load)); +#ifdef CONFIG_SMP + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->cfs_load_avg = rq->cfs.avg.load_avg; + ctx->cfs_runnable_avg = rq->cfs.avg.runnable_avg; + ctx->cfs_util_avg = rq->cfs.avg.util_avg; + ctx->rt_load_avg = rq->avg_rt.load_avg; + ctx->rt_runnable_avg = rq->avg_rt.runnable_avg; + ctx->rt_util_avg = rq->avg_rt.util_avg; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + ctx->irq_load_avg = rq->avg_irq.load_avg; + ctx->irq_runnable_avg = rq->avg_irq.runnable_avg; + ctx->irq_util_avg = rq->avg_irq.util_avg; +#endif +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_load_of_proto = { + .func = bpf_sched_cpu_load_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_nr_running_of, int, cpu, + struct bpf_sched_cpu_nr_running *, ctx, + int, len) +{ + struct rq *rq = cpu_rq(cpu); + + if (len != sizeof(*ctx)) + return -EINVAL; + + SCHED_WARN_ON(!rcu_read_lock_held()); + + ctx->nr_running = rq->nr_running; + ctx->cfs_nr_running = rq->cfs.nr_running; + ctx->cfs_h_nr_runing = rq->cfs.h_nr_running; + ctx->cfs_idle_h_nr_running = rq->cfs.idle_h_nr_running; + ctx->rt_nr_running = rq->rt.rt_nr_running; + ctx->rr_nr_running = rq->rt.rr_nr_running; + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_nr_running_of_proto = { + .func = bpf_sched_cpu_nr_running_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_idle_stat_of, int, cpu, + struct bpf_sched_cpu_idle_stat *, ctx, + int, len) +{ + struct rq *rq = cpu_rq(cpu); + struct cpuidle_state *idle = NULL; + + if (len != sizeof(*ctx)) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_idle_stat)); + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->available_idle = available_idle_cpu(cpu); + idle = idle_get_state(rq); + if (idle) + ctx->exit_latency = idle->exit_latency; + +#ifdef CONFIG_SMP + ctx->idle_stamp = rq->idle_stamp; + ctx->avg_idle = rq->avg_idle; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_idle_stat_of_proto = { + .func = bpf_sched_cpu_idle_stat_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_capacity_of, int, cpu, + struct bpf_sched_cpu_capacity *, ctx, + int, len) +{ + struct rq *rq = cpu_rq(cpu); + + if (len != sizeof(*ctx)) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_capacity)); +#ifdef CONFIG_SMP + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->capacity = rq->cpu_capacity; + ctx->capacity_orig = rq->cpu_capacity_orig; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_capacity_of_proto = { + .func = 
bpf_sched_cpu_capacity_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -266,6 +397,14 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_entity_to_task_proto; case BPF_FUNC_sched_entity_to_tg: return &bpf_sched_entity_to_tg_proto; + case BPF_FUNC_sched_cpu_load_of: + return &bpf_sched_cpu_load_of_proto; + case BPF_FUNC_sched_cpu_nr_running_of: + return &bpf_sched_cpu_nr_running_of_proto; + case BPF_FUNC_sched_cpu_idle_stat_of: + return &bpf_sched_cpu_idle_stat_of_proto; + case BPF_FUNC_sched_cpu_capacity_of: + return &bpf_sched_cpu_capacity_of_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index f8a778ac9ce1..f2b5e63801ca 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -437,6 +437,10 @@ class PrinterHelpers(Printer): 'struct btf_ptr', 'struct sched_entity', 'struct task_group', + 'struct bpf_sched_cpu_load', + 'struct bpf_sched_cpu_nr_running', + 'struct bpf_sched_cpu_idle_stat', + 'struct bpf_sched_cpu_capacity', ] known_types = { '...', @@ -482,6 +486,10 @@ class PrinterHelpers(Printer): 'struct btf_ptr', 'struct sched_entity', 'struct task_group', + 'struct bpf_sched_cpu_load', + 'struct bpf_sched_cpu_nr_running', + 'struct bpf_sched_cpu_idle_stat', + 'struct bpf_sched_cpu_capacity', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a751da128857..7d315debb910 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3819,6 +3819,30 @@ union bpf_attr { * Set tag to *tsk*. * Return * Nothing. Always succeeds. + * + * int bpf_sched_cpu_load_of(int cpu, struct bpf_sched_cpu_load *ctx, int len) + * Description + * Get *cpu* load returned in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_nr_running_of(int cpu, struct bpf_sched_cpu_nr_running *ctx, int len) + * Description + * Get *cpu* nr running returned in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_idle_stat_of(int cpu, struct bpf_sched_cpu_idle_stat *ctx, int len) + * Description + * Get *cpu* idle state returned in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_capacity_of(int cpu, struct bpf_sched_cpu_capacity *ctx, int len) + * Description + * Get *cpu* capacity returned in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3989,6 +4013,10 @@ union bpf_attr { FN(sched_entity_to_tg), \ FN(sched_set_tg_tag), \ FN(sched_set_task_tag), \ + FN(sched_cpu_load_of), \ + FN(sched_cpu_nr_running_of), \ + FN(sched_cpu_idle_stat_of), \ + FN(sched_cpu_capacity_of), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Ren Zhijie <renzhijie2@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
Add the bpf helper function bpf_init_cpu_topology(), which obtains CPU topology information through the topology_* macros defined in include/linux/topology.h and saves it in a BPF map. A second helper, bpf_get_system_cpus(), reports the system-wide CPU masks and counts.
The CPU topology information is useful for core selection.
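For illustration, a hedged BPF-side sketch of how the two helpers might be used; the map name, the BPF_CPU_NR bound and the one-shot flag are assumptions, not part of this patch:

  #define BPF_CPU_NR 1024         /* assumed upper bound on CPU ids */

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, BPF_CPU_NR);
          __type(key, int);
          __type(value, struct bpf_cpu_topology);
  } map_cpu_topology SEC(".maps");

  static int topo_ready;

  static __always_inline void topology_init_once(void)
  {
          struct bpf_system_cpus cpus = {};

          if (topo_ready)
                  return;

          /* Fill one bpf_cpu_topology entry per active CPU. */
          if (!bpf_init_cpu_topology(&map_cpu_topology, BPF_ANY))
                  topo_ready = 1;

          /* System-wide masks and counts, e.g. cpus.nr_cpu_ids. */
          bpf_get_system_cpus(&cpus, sizeof(cpus));
  }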
Signed-off-by: Ren Zhijie renzhijie2@huawei.com Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Ren Zhijie renzhijie2@huawei.com --- include/linux/bpf_topology.h | 46 ++++++++++++++++ include/uapi/linux/bpf.h | 14 +++++ kernel/bpf/helpers.c | 6 +++ kernel/sched/Makefile | 3 +- kernel/sched/bpf_sched.c | 1 + kernel/sched/bpf_topology.c | 99 ++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 14 +++++ 8 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf_topology.h create mode 100644 kernel/sched/bpf_topology.c
diff --git a/include/linux/bpf_topology.h b/include/linux/bpf_topology.h new file mode 100644 index 000000000000..d0b7540c3580 --- /dev/null +++ b/include/linux/bpf_topology.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_BPF_TOPOLOGY_H +#define _LINUX_BPF_TOPOLOGY_H + +#include <linux/cpumask.h> + +struct bpf_cpu_topology { + int cpu; + int core_id; + int cluster_id; + int die_id; + int physical_package_id; + int numa_node; + struct cpumask thread_siblings; + struct cpumask core_siblings; + struct cpumask cluster_cpus; + struct cpumask die_cpus; + struct cpumask package_cpus; + struct cpumask node_cpu_lists; +}; + +struct bpf_system_cpus { + unsigned int nums_possible_cpus; + unsigned int nums_active_cpus; + unsigned int nums_isolate_cpus; + unsigned int nr_cpu_ids; + unsigned int bpf_nr_cpumask_bits; + struct cpumask cpu_possible_cpumask; + struct cpumask cpu_active_cpumask; + struct cpumask cpu_isolate_cpumask; +}; + +#endif /* _LINUX_BPF_TOPOLOGY_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c53cb91f67d1..ac8e951c9990 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3843,6 +3843,18 @@ union bpf_attr { * Get *cpu* capacity returned in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map, u64 flags) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_system_cpus(struct bpf_system_cpus *cpus, int len) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4017,6 +4029,8 @@ union bpf_attr { FN(sched_cpu_nr_running_of), \ FN(sched_cpu_idle_stat_of), \ FN(sched_cpu_capacity_of), \ + FN(init_cpu_topology), \ + FN(get_system_cpus), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41c273db0ca8..391361b6d045 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -660,6 +660,8 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_sched_set_tg_tag_proto __weak; const struct bpf_func_proto bpf_sched_set_task_tag_proto __weak; +const struct bpf_func_proto bpf_init_cpu_topology_proto __weak; +const struct bpf_func_proto bpf_get_system_cpus_proto __weak;
const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -699,6 +701,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_init_cpu_topology: + return &bpf_init_cpu_topology_proto; + case BPF_FUNC_get_system_cpus: + return &bpf_get_system_cpus_proto; default: break; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae9e39eb83a..c809d5c28424 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -36,4 +36,5 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o -obj-$(CONFIG_BPF_SCHED) += bpf_sched.o \ No newline at end of file +obj-$(CONFIG_BPF_SCHED) += bpf_sched.o +obj-$(CONFIG_BPF_SCHED) += bpf_topology.o \ No newline at end of file diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index cc63638b5e81..89845caae2f2 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -4,6 +4,7 @@ #include <linux/bpf_verifier.h> #include <linux/bpf_sched.h> #include <linux/btf_ids.h> +#include <linux/bpf_topology.h> #include "sched.h"
DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); diff --git a/kernel/sched/bpf_topology.c b/kernel/sched/bpf_topology.c new file mode 100644 index 000000000000..7106da506f72 --- /dev/null +++ b/kernel/sched/bpf_topology.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/bpf_verifier.h> +#include <linux/topology.h> +#include <linux/cpumask.h> +#include <linux/bpf_topology.h> +#include <linux/sched/isolation.h> + +static void bpf_update_cpu_topology(struct bpf_cpu_topology *cpu_topology, int cpu) +{ + cpu_topology->cpu = cpu; + cpu_topology->core_id = topology_core_id(cpu); + cpu_topology->cluster_id = topology_cluster_id(cpu); + cpu_topology->die_id = topology_die_id(cpu); + cpu_topology->physical_package_id = topology_physical_package_id(cpu); + cpu_topology->numa_node = cpu_to_node(cpu); + cpumask_copy(&cpu_topology->thread_siblings, topology_sibling_cpumask(cpu)); + cpumask_copy(&cpu_topology->core_siblings, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->cluster_cpus, topology_cluster_cpumask(cpu)); + cpumask_copy(&cpu_topology->die_cpus, topology_die_cpumask(cpu)); + cpumask_copy(&cpu_topology->package_cpus, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->node_cpu_lists, cpumask_of_node(cpu_to_node(cpu))); +} + +BPF_CALL_2(bpf_init_cpu_topology, struct bpf_map *, map, u64, flags) +{ + const struct cpumask *cpu_map = cpu_active_mask; + int ret = 0; + int i = -1; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + for_each_cpu(i, cpu_map) { + struct bpf_cpu_topology topo; + + bpf_update_cpu_topology(&topo, i); + ret = map->ops->map_update_elem(map, &i, &topo, flags); + if (ret) { + int idx = i; + + for (; idx >= 0; idx--) + map->ops->map_delete_elem(map, &idx); + break; + } + } + + return ret; +} + +BTF_ID_LIST_SINGLE(bpf_cpu_topology_ids, struct, bpf_cpu_topology) + +const struct bpf_func_proto bpf_init_cpu_topology_proto = { + .func = bpf_init_cpu_topology, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_get_system_cpus, struct bpf_system_cpus *, cpus, + int, len) +{ + if (len != sizeof(*cpus)) + return -EINVAL; + + memset(cpus, 0, sizeof(struct bpf_system_cpus)); + + cpumask_copy(&cpus->cpu_possible_cpumask, cpu_possible_mask); + cpumask_copy(&cpus->cpu_active_cpumask, cpu_active_mask); + cpumask_copy(&cpus->cpu_isolate_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpus->nums_possible_cpus = num_possible_cpus(); + cpus->nums_active_cpus = num_active_cpus(); + cpus->nums_isolate_cpus = cpumask_weight(&cpus->cpu_isolate_cpumask); + cpus->nr_cpu_ids = nr_cpu_ids; + cpus->bpf_nr_cpumask_bits = nr_cpumask_bits; + + return 0; +} + +const struct bpf_func_proto bpf_get_system_cpus_proto = { + .func = bpf_get_system_cpus, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, +}; diff --git 
a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index f2b5e63801ca..1a55adee2a6d 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -441,6 +441,8 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_nr_running', 'struct bpf_sched_cpu_idle_stat', 'struct bpf_sched_cpu_capacity', + 'struct bpf_cpu_topology', + 'struct bpf_system_cpus', ] known_types = { '...', @@ -490,6 +492,8 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_nr_running', 'struct bpf_sched_cpu_idle_stat', 'struct bpf_sched_cpu_capacity', + 'struct bpf_cpu_topology', + 'struct bpf_system_cpus', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7d315debb910..8a09dfc1dad5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3843,6 +3843,18 @@ union bpf_attr { * Get *cpu* capacity returned in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map, u64 flags) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_system_cpus(struct bpf_system_cpus *cpus, int len) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4017,6 +4029,8 @@ union bpf_attr { FN(sched_cpu_nr_running_of), \ FN(sched_cpu_idle_stat_of), \ FN(sched_cpu_capacity_of), \ + FN(init_cpu_topology), \ + FN(get_system_cpus), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a collection of cpumask operations, such as cpumask_empty, cpumask_and, cpumask_andnot, cpumask_subset, cpumask_equal and cpumask_copy, exposed to BPF programs through a single bpf_cpumask_op() helper.
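As an illustration (not part of the patch itself), a scheduler BPF program is expected to drive the new helper roughly as follows; the wrapper name is made up, and the cpumask pointers are assumed to be masks the program may legitimately hand back to the kernel, for example masks taken from a scheduler hook context:

static __always_inline int example_cpumask_and(struct cpumask *dst,
                                               struct cpumask *src1,
                                               struct cpumask *src2)
{
        struct cpumask_op_args op;

        op.op_type = CPUMASK_AND;
        op.arg1 = dst;
        op.arg2 = src1;
        op.arg3 = src2;
        op.arg4 = (void *)0;

        /* returns the cpumask_and() result: non-zero if dst is not empty */
        return bpf_cpumask_op(&op, sizeof(op));
}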
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched.h | 23 +++++++++ include/uapi/linux/bpf.h | 7 +++ kernel/sched/bpf_sched.c | 86 ++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 7 +++ 5 files changed, 127 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6796437eec45..08e7e6d64f55 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2217,5 +2217,28 @@ struct bpf_sched_cpu_capacity { unsigned long capacity; unsigned long capacity_orig; }; + +struct cpumask_op_args { + unsigned int op_type; + void *arg1; + void *arg2; + void *arg3; + void *arg4; +}; + +enum cpumask_op_type { + CPUMASK_EMPTY, + CPUMASK_AND, + CPUMASK_ANDNOT, + CPUMASK_SUBSET, + CPUMASK_EQUAL, + CPUMASK_TEST_CPU, + CPUMASK_COPY, + CPUMASK_WEIGHT, + CPUMASK_NEXT, + CPUMASK_NEXT_WRAP, + CPUMASK_NEXT_AND, + CPUMASK_CPULIST_PARSE +}; #endif #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ac8e951c9990..5f794a297c79 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3855,6 +3855,12 @@ union bpf_attr { * Get system cpus returned in *cpus*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * cpumask operation collections. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4031,6 +4037,7 @@ union bpf_attr { FN(sched_cpu_capacity_of), \ FN(init_cpu_topology), \ FN(get_system_cpus), \ + FN(cpumask_op), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 89845caae2f2..00f3186a011b 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -375,6 +375,90 @@ static const struct bpf_func_proto bpf_sched_cpu_capacity_of_proto = { .arg3_type = ARG_CONST_SIZE, };
+BPF_CALL_2(bpf_cpumask_op, struct cpumask_op_args *, op, int, len) +{ + int ret; + + if (len != sizeof(*op) || !op->arg1) + return -EINVAL; + + switch (op->op_type) { + case CPUMASK_EMPTY: + return cpumask_empty((const struct cpumask *)op->arg1); + case CPUMASK_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_and((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_ANDNOT: + if (!op->arg2 || !op->arg3) + return -EINVAL; + cpumask_andnot((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + break; + case CPUMASK_SUBSET: + if (!op->arg2) + return -EINVAL; + return cpumask_subset((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_EQUAL: + if (!op->arg2) + return -EINVAL; + return cpumask_equal((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_TEST_CPU: + if (!op->arg2) + return -EINVAL; + return cpumask_test_cpu(*(int *)op->arg1, op->arg2); + case CPUMASK_COPY: + if (!op->arg2) + return -EINVAL; + cpumask_copy((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + break; + case CPUMASK_WEIGHT: + return cpumask_weight((const struct cpumask *)op->arg1); + case CPUMASK_NEXT: + if (!op->arg2) + return -EINVAL; + return cpumask_next(*(int *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_NEXT_WRAP: + if (!op->arg2 || !op->arg3 || !op->arg4) + return -EINVAL; + return cpumask_next_wrap(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + *(int *)op->arg3, *(int *)op->arg4); + case CPUMASK_NEXT_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_next_and(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_CPULIST_PARSE: + if (!op->arg2) + return -EINVAL; + + op->arg1 = (void *)strstrip((void *)op->arg1); + ret = cpulist_parse((void *)op->arg1, + (struct cpumask *)op->arg2); + return ret; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_cpumask_op_proto = { + .func = bpf_cpumask_op, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, +};
static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -406,6 +490,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_cpu_idle_stat_of_proto; case BPF_FUNC_sched_cpu_capacity_of: return &bpf_sched_cpu_capacity_of_proto; + case BPF_FUNC_cpumask_op: + return &bpf_cpumask_op_proto; default: return bpf_base_func_proto(func_id); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 1a55adee2a6d..8ec9f0302dac 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -443,6 +443,8 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_capacity', 'struct bpf_cpu_topology', 'struct bpf_system_cpus', + 'struct cpumask', + 'struct cpumask_op_args', ] known_types = { '...', @@ -494,6 +496,8 @@ class PrinterHelpers(Printer): 'struct bpf_sched_cpu_capacity', 'struct bpf_cpu_topology', 'struct bpf_system_cpus', + 'struct cpumask', + 'struct cpumask_op_args', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8a09dfc1dad5..7afd2958934d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3855,6 +3855,12 @@ union bpf_attr { * Get system cpus returned in *cpus*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * cpumask operation collections. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4031,6 +4037,7 @@ union bpf_attr { FN(sched_cpu_capacity_of), \ FN(init_cpu_topology), \ FN(get_system_cpus), \ + FN(cpumask_op), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Hui Tang tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a helper function to check whether two CPUs share the same LLC (last-level cache).
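For illustration only (the function name and CPU arguments are placeholders), a wakeup-path program could use the helper to prefer a cache-hot CPU:

static __always_inline int example_pick_cache_hot(int prev_cpu, int target_cpu)
{
        /* non-zero when both CPUs sit under the same last-level cache */
        if (bpf_cpus_share_cache(prev_cpu, target_cpu))
                return target_cpu;      /* migrating keeps the working set warm */

        return prev_cpu;                /* otherwise stay on the previous CPU */
}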
Signed-off-by: Hui Tang tanghui20@huawei.com --- include/uapi/linux/bpf.h | 7 +++++++ kernel/sched/bpf_sched.c | 18 ++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 3 files changed, 32 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5f794a297c79..bfd7ddc6bd54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3861,6 +3861,12 @@ union bpf_attr { * cpumask operation collections. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * true yes, false no. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4038,6 +4044,7 @@ union bpf_attr { FN(init_cpu_topology), \ FN(get_system_cpus), \ FN(cpumask_op), \ + FN(cpus_share_cache), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 00f3186a011b..d4224c80d5cf 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -335,6 +335,8 @@ BPF_CALL_3(bpf_sched_cpu_idle_stat_of, int, cpu, ctx->avg_idle = rq->avg_idle; #endif
+ ctx->available_idle = available_idle_cpu(cpu); + return 0; }
@@ -460,6 +462,20 @@ static const struct bpf_func_proto bpf_cpumask_op_proto = { .arg2_type = ARG_CONST_SIZE, };
+BPF_CALL_2(bpf_cpus_share_cache, int, src_cpu, + int, dst_cpu) +{ + return cpus_share_cache(src_cpu, dst_cpu); +} + +static const struct bpf_func_proto bpf_cpus_share_cache_proto = { + .func = bpf_cpus_share_cache, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -492,6 +508,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_cpu_capacity_of_proto; case BPF_FUNC_cpumask_op: return &bpf_cpumask_op_proto; + case BPF_FUNC_cpus_share_cache: + return &bpf_cpus_share_cache_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7afd2958934d..a88c9b495824 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3861,6 +3861,12 @@ union bpf_attr { * cpumask operation collections. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * true yes, false no. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4038,6 +4044,7 @@ union bpf_attr { FN(init_cpu_topology), \ FN(get_system_cpus), \ FN(cpumask_op), \ + FN(cpus_share_cache), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a library for the programmable scheduler; these helper functions make it easier for users to write scheduler BPF programs.
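For example, a program that includes the header could scan a candidate mask for an idle CPU and fall back to the least utilized one, much like the samples added later in this series (the function name is illustrative):

#include <bpf/libbpf_sched.h>

static __always_inline int example_pick_cpu(struct cpumask *mask)
{
        unsigned long best_util = (unsigned long)-1;
        unsigned long util;
        int best_cpu = -1;
        int cpu;

        libbpf_for_each_cpu(cpu, mask) {
                if (libbpf_available_idle_cpu(cpu))
                        return cpu;     /* an idle CPU wins immediately */

                /* otherwise remember the least utilized CPU seen so far */
                util = libbpf_cfs_util_avg_of(cpu);
                if (util < best_util) {
                        best_util = util;
                        best_cpu = cpu;
                }
        }

        return best_cpu;
}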
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com --- tools/lib/bpf/libbpf_sched.h | 435 +++++++++++++++++++++++++++++++++++ 1 file changed, 435 insertions(+) create mode 100644 tools/lib/bpf/libbpf_sched.h
diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h new file mode 100644 index 000000000000..b937640bf0fc --- /dev/null +++ b/tools/lib/bpf/libbpf_sched.h @@ -0,0 +1,435 @@ +// PDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef __LIBBPF_LIBSCHED_H +#define __LIBBPF_LIBSCHED_H + +#include <linux/bpf_topology.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define INVALID_PTR ((void *)(0UL)) +#define getVal(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask); +static __always_inline long libbpf_cpumask_next_wrap(int n, struct cpumask *mask, int start, int wrap); +static __always_inline long libbpf_cpumask_next_and(int n, struct cpumask *mask1, struct cpumask *mask2); +static __always_inline int libbpf_nr_cpus_ids(void); +static __always_inline int libbpf_nr_cpumask_bits(void); + +#if NR_CPUS == 1 + +#define libbpf_for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) + +#else + +#define libbpf_for_each_cpu(cpu, mask) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next((cpu), (mask)), \ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for (int __i = 0, (cpu) = libbpf_cpumask_next_wrap((start) - 1, (mask), (start), false); \ + (cpu) < libbpf_nr_cpumask_bits() && __i < NR_CPUS; \ + (cpu) = libbpf_cpumask_next_wrap((cpu), (mask), (start), true), __i++) + +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next_and((cpu), (mask1), (mask2)), \ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#endif + +static __always_inline long libbpf_cpumask_copy(struct cpumask *dst, struct cpumask *src) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_COPY; + op.arg1 = dst; + op.arg2 = src; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_empty(struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EMPTY; + op.arg1 = mask; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_and(struct cpumask *dst, struct cpumask *src1, struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_AND; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_andnot(struct cpumask *dst, struct cpumask *src1, struct 
cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_ANDNOT; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_subset(struct cpumask *src1, struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_SUBSET; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_equal(struct cpumask *src1, struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EQUAL; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_weight(struct cpumask *src1) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_WEIGHT; + op.arg1 = src1; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_test_cpu(int cpu, struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_TEST_CPU; + op.arg1 = &cpu; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_wrap(int n, struct cpumask *mask, int start, int wrap) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_WRAP; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = &start; + op.arg4 = &wrap; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_and(int n, struct cpumask *mask1, struct cpumask *mask2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_AND; + op.arg1 = &n; + op.arg2 = mask1; + op.arg3 = mask2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_cpulist_parse(char *src1, struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_CPULIST_PARSE; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline int libbpf_num_active_cpus(void) +{ + struct bpf_system_cpus cpus; + + bpf_get_system_cpus(&cpus, sizeof(cpus)); + return getVal(cpus.nums_active_cpus); +} + +static __always_inline int libbpf_num_possible_cpus(void) +{ + struct bpf_system_cpus cpus; + + bpf_get_system_cpus(&cpus, sizeof(cpus)); + return getVal(cpus.nums_possible_cpus); +} + +static __always_inline void libbpf_possible_cpus_mask(struct cpumask *mask) +{ + struct bpf_system_cpus cpus; + + bpf_get_system_cpus(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, &cpus.cpu_possible_cpumask); +} + +static __always_inline void libbpf_active_cpus_mask(struct cpumask *mask) +{ + struct bpf_system_cpus cpus; + + bpf_get_system_cpus(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, &cpus.cpu_active_cpumask); +} + +static __always_inline void libbpf_isolate_cpus_mask(struct cpumask *mask) +{ + struct bpf_system_cpus cpus; + + bpf_get_system_cpus(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, &cpus.cpu_isolate_cpumask); +} + +static __always_inline 
int libbpf_nr_cpus_ids(void) +{ + struct bpf_system_cpus cpus; + + bpf_get_system_cpus(&cpus, sizeof(cpus)); + return getVal(cpus.nr_cpu_ids); +} + +static __always_inline int libbpf_nr_cpumask_bits(void) +{ + struct bpf_system_cpus cpus; + + bpf_get_system_cpus(&cpus, sizeof(cpus)); + return getVal(cpus.bpf_nr_cpumask_bits); +} + +static __always_inline unsigned long libbpf_cfs_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_load_avg); +} + +static __always_inline unsigned long libbpf_cfs_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_runnable_avg); +} + +static __always_inline unsigned long libbpf_cfs_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_util_avg); +} + +static __always_inline unsigned long libbpf_rt_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_load_avg; +} + +static __always_inline unsigned long libbpf_rt_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_runnable_avg; +} + +static __always_inline unsigned long libbpf_rt_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_util_avg; +} + +static __always_inline unsigned long libbpf_irq_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.irq_load_avg; +} + +static __always_inline unsigned long libbpf_irq_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.irq_util_avg; +} + +static __always_inline unsigned int libbpf_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.nr_running); +} + +static __always_inline unsigned int libbpf_cfs_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_h_nr_runing); +} + +static __always_inline unsigned int libbpf_cfs_idle_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return running.cfs_idle_h_nr_running; +} + +static __always_inline unsigned int libbpf_rt_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.rt_nr_running); +} + +static __always_inline unsigned int libbpf_rr_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return running.rr_nr_running; +} + +static __always_inline unsigned int libbpf_exit_latency_of(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.exit_latency; +} + +static __always_inline unsigned long libbpf_idle_stamp_of(int cpu) +{ + struct 
bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.idle_stamp; +} + +static __always_inline unsigned long libbpf_avg_idle_of(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.avg_idle; +} + +static __always_inline unsigned long libbpf_available_idle_cpu(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return getVal(stat.available_idle); +} + +static __always_inline unsigned long libbpf_capacity_of(int cpu) +{ + struct bpf_sched_cpu_capacity cap; + + bpf_sched_cpu_capacity_of(cpu, &cap, sizeof(cap)); + return getVal(cap.capacity); +} + +static __always_inline unsigned long libbpf_capacity_orig_of(int cpu) +{ + struct bpf_sched_cpu_capacity cap; + + bpf_sched_cpu_capacity_of(cpu, &cap, sizeof(cap)); + return cap.capacity_orig; +} + +static __always_inline int libbpf_cpus_share_cache(int src_cpu, int dst_cpu) +{ + return bpf_cpus_share_cache(src_cpu, dst_cpu); +} + +static __always_inline void libbpf_sched_set_cpus_ptr(struct sched_migrate_ctx *c, + struct cpumask *cpus) +{ + bpf_sched_set_cpus_ptr(c, cpus, sizeof(*cpus)); +} +#endif
From: Chen Hui judy.chenhui@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add hooks on the select_task_rq_fair() path: cfs_select_rq to override the CPU selection, cfs_wake_affine for the wake-affine decision, and cfs_select_rq_exit to adjust the result before the function returns.
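A minimal sketch of a program for the cfs_select_rq hook (the program name is made up; returning any negative value keeps the CPU the kernel would have chosen on its own):

#include <linux/version.h>
#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/libbpf_sched.h>

SEC("sched/cfs_select_rq")
int BPF_PROG(example_stick_to_prev, struct sched_migrate_ctx *c)
{
        int prev_cpu = getVal(c->prev_cpu);

        /* keep the task where it last ran as long as that CPU is idle */
        if (libbpf_available_idle_cpu(prev_cpu))
                return prev_cpu;

        return -1;      /* negative: fall back to the kernel's own selection */
}

char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;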
Signed-off-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Hui Tang tanghui20@huawei.com --- include/linux/sched.h | 20 +++++++++++++ include/linux/sched_hook_defs.h | 3 ++ kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ 4 files changed, 75 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 08e7e6d64f55..700a80d3f6d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2240,5 +2240,25 @@ enum cpumask_op_type { CPUMASK_NEXT_AND, CPUMASK_CPULIST_PARSE }; + +struct sched_migrate_ctx { + struct task_struct *task; + struct cpumask *cpus_allowed; + struct cpumask *select_idle_mask; + int prev_cpu; + int curr_cpu; + int is_sync; + int want_affine; + int wake_flags; + int sd_flag; + int new_cpu; +}; + +struct sched_affine_ctx { + struct task_struct *task; + int prev_cpu; + int curr_cpu; + int is_sync; +}; #endif #endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index e2f65e4b8895..9f87609cde8a 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -3,3 +3,6 @@ BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsign BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, struct sched_entity *se) +BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) +BPF_SCHED_HOOK(int, 0, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3395d102b43e..79e0a475599a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6020,6 +6020,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, { int target = nr_cpumask_bits;
+#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + struct sched_affine_ctx ctx; + int ret; + + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = this_cpu; + ctx.is_sync = sync; + + ret = bpf_sched_cfs_wake_affine(&ctx); + if (ret >= 0 && ret < nr_cpumask_bits) + return ret; + } +#endif + if (sched_feat(WA_IDLE)) target = wake_affine_idle(this_cpu, prev_cpu, sync);
@@ -6884,6 +6900,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); +#ifdef CONFIG_BPF_SCHED + struct sched_migrate_ctx ctx; + int ret; +#endif
time = schedstat_start_time();
@@ -6901,6 +6921,26 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f }
rcu_read_lock(); +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = cpu; + ctx.is_sync = sync; + ctx.wake_flags = wake_flags; + ctx.want_affine = want_affine; + ctx.sd_flag = sd_flag; + ctx.cpus_allowed = (void *)p->cpus_ptr; + ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask); + + ret = bpf_sched_cfs_select_rq(&ctx); + if (ret >= 0) { + rcu_read_unlock(); + return ret; + } + } +#endif + for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, @@ -6932,6 +6972,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (want_affine) current->recent_used_cpu = cpu; } + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.new_cpu = new_cpu; + ret = bpf_sched_cfs_select_rq_exit(&ctx); + if (ret >= 0) + new_cpu = ret; + } +#endif + rcu_read_unlock(); schedstat_end_time(cpu_rq(cpu), time);
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 8ec9f0302dac..c8e87fae7b17 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -445,6 +445,7 @@ class PrinterHelpers(Printer): 'struct bpf_system_cpus', 'struct cpumask', 'struct cpumask_op_args', + 'struct sched_migrate_ctx', ] known_types = { '...', @@ -498,6 +499,7 @@ class PrinterHelpers(Printer): 'struct bpf_system_cpus', 'struct cpumask', 'struct cpumask_op_args', + 'struct sched_migrate_ctx', } mapped_types = { 'u8': '__u8',
From: Hui Tang tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a helper function to set cpus_ptr in the task.
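A rough sketch of the intended use, mirroring the select-core sample added later in the series: narrow the task's cpus_ptr inside cfs_select_rq and restore the original mask in cfs_select_rq_exit. The program names and the "0-3" cpulist are made up for illustration:

#include <linux/version.h>
#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/libbpf_sched.h>

char prefer_cpulist[] = "0-3";          /* made-up preferred CPU range */

SEC("sched/cfs_select_rq")
int BPF_PROG(example_narrow_cpus, struct sched_migrate_ctx *c)
{
        struct cpumask *prefer_cpus = getVal(c->select_idle_mask);

        if (libbpf_cpumask_cpulist_parse(prefer_cpulist, prefer_cpus))
                return -1;              /* parse failed, change nothing */

        /* let the fair-class selection below only see the preferred CPUs */
        libbpf_sched_set_cpus_ptr(c, prefer_cpus);

        return -1;                      /* still let the kernel pick the CPU */
}

SEC("sched/cfs_select_rq_exit")
int BPF_PROG(example_restore_cpus, struct sched_migrate_ctx *c)
{
        /* always hand the original mask back before leaving the hook */
        libbpf_sched_set_cpus_ptr(c, (void *)getVal(c->cpus_allowed));

        return -1;                      /* negative: keep the chosen CPU */
}

char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;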
Signed-off-by: Hui Tang tanghui20@huawei.com --- include/uapi/linux/bpf.h | 7 +++++++ kernel/sched/bpf_sched.c | 23 +++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 3 files changed, 37 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfd7ddc6bd54..8cc17cbac415 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3867,6 +3867,12 @@ union bpf_attr { * check src_cpu whether share cache with dst_cpu. * Return * true yes, false no. + * + * int bpf_sched_set_cpus_ptr(struct sched_migrate_ctx *c, struct cpumask *cpus, int len) + * Description + * set cpus_ptr in task. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4045,6 +4051,7 @@ union bpf_attr { FN(get_system_cpus), \ FN(cpumask_op), \ FN(cpus_share_cache), \ + FN(sched_set_cpus_ptr), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index d4224c80d5cf..18ed8bce35e1 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -476,6 +476,27 @@ static const struct bpf_func_proto bpf_cpus_share_cache_proto = { .arg2_type = ARG_ANYTHING, };
+BPF_CALL_3(bpf_sched_set_cpus_ptr, struct sched_migrate_ctx *, c, + struct cpumask *, cpus, int, len) +{ + if (len != sizeof(*cpus)) + return -EINVAL; + + c->task->cpus_ptr = cpus; + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_sched_migrate_ctx_ids, struct, sched_migrate_ctx) + +static const struct bpf_func_proto bpf_sched_set_cpus_ptr_proto = { + .func = bpf_sched_set_cpus_ptr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sched_migrate_ctx_ids[0], + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -508,6 +529,8 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_cpu_capacity_of_proto; case BPF_FUNC_cpumask_op: return &bpf_cpumask_op_proto; + case BPF_FUNC_sched_set_cpus_ptr: + return &bpf_sched_set_cpus_ptr_proto; case BPF_FUNC_cpus_share_cache: return &bpf_cpus_share_cache_proto; default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a88c9b495824..f3206757cedf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3867,6 +3867,12 @@ union bpf_attr { * check src_cpu whether share cache with dst_cpu. * Return * true yes, false no. + * + * int bpf_sched_set_cpus_ptr(struct sched_migrate_ctx *c, struct cpumask *cpus, int len) + * Description + * set cpus_ptr in task. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4045,6 +4051,7 @@ union bpf_attr { FN(get_system_cpus), \ FN(cpumask_op), \ FN(cpus_share_cache), \ + FN(sched_set_cpus_ptr), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
From: Hui Tang tanghui20@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
1. Samples support the 'cfs_select_rq' hook
2. Samples support the 'cfs_wake_affine' hook
3. Samples support the 'cfs_select_rq_exit' hook
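For example, running './sched_select_core -R' loads sched_select_core_kern.o, attaches the cfs_select_cpu_range and cfs_select_cpu_range_exit programs, and then dumps trace_pipe until interrupted; '-C' and '-W' attach the plain select-core and wake-affine programs instead.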
Signed-off-by: Hui Tang tanghui20@huawei.com --- samples/bpf/Makefile | 3 + samples/bpf/sched_select_core_kern.c | 239 +++++++++++++++++++++++++++ samples/bpf/sched_select_core_user.c | 114 +++++++++++++ 3 files changed, 356 insertions(+) create mode 100644 samples/bpf/sched_select_core_kern.c create mode 100644 samples/bpf/sched_select_core_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index e473bad76549..62dadae992a2 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -55,6 +55,7 @@ tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm tprogs-y += sched_preempt +tprogs-y += sched_select_core
# Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -113,6 +114,7 @@ xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) sched_preempt-objs := sched_preempt_user.o +sched_select_core-objs := sched_select_core_user.o
# Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -175,6 +177,7 @@ always-y += hbm_out_kern.o always-y += hbm_edt_kern.o always-y += xdpsock_kern.o always-y += sched_preempt_kern.o +always-y += sched_select_core_kern.o
ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/sched_select_core_kern.c b/samples/bpf/sched_select_core_kern.c new file mode 100644 index 000000000000..3a01899819f5 --- /dev/null +++ b/samples/bpf/sched_select_core_kern.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2022 Huawei + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/version.h> +#include <linux/sched.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/libbpf_sched.h> +#include <linux/cpumask.h> + +#define STR_MAX (32) +#define SELECT_RQ_RANGE (-1) +#define SELECT_RQ_EXIT_CPU_VALID (-2) + +/* From kernel/sched/sched.h */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ +#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ + +#define TAG_ID(id) TAG_##id + +enum tag_id { + TAG_NONE, + TAG_ID(1), + TAG_ID(2), + TAG_MAX +}; + +struct tag_info { + long tag; + char buf[STR_MAX]; +}; + +struct tag_info tag_tbl[] = { + {TAG_NONE, ""}, + {TAG_ID(1), "0-3"}, + {TAG_ID(2), "4-7"}, + {TAG_MAX, ""}, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} map_idlest_cpu SEC(".maps"); + +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct cpumask *prefer_cpus, + struct cpumask *cpus_allowed) +{ + return !libbpf_cpumask_empty(prefer_cpus) && + !libbpf_cpumask_equal(prefer_cpus, cpus_allowed) && + libbpf_cpumask_subset(prefer_cpus, cpus_allowed); +} + +static struct cpumask *get_better_cpus(struct task_struct *p, + struct cpumask *prefer_cpus, + int *idlest_cpu) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + unsigned int weight; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + + if (!prefer_cpus_valid(prefer_cpus, (void *)getVal(p->cpus_ptr))) + return (void *)getVal(p->cpus_ptr); + + tg = p->sched_task_group; + libbpf_for_each_cpu(cpu, prefer_cpus) { + if (idlest_cpu && libbpf_available_idle_cpu(cpu)) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(libbpf_capacity_of(cpu) - libbpf_cfs_util_avg_of(cpu)); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (libbpf_available_idle_cpu(cpu)) { + return getVal(prefer_cpus); + } + + util_avg_sum += libbpf_cfs_util_avg_of(cpu); + tg_capacity += libbpf_capacity_of(cpu); + } + + weight = libbpf_cpumask_weight(prefer_cpus); + if (tg_capacity > weight && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { + return getVal(prefer_cpus); + } + + return (void *)getVal(p->cpus_ptr); +} + +SEC("sched/cfs_select_rq") +int BPF_PROG(cfs_select_cpu_range, struct sched_migrate_ctx *c) +{ + struct cpumask *prefer_cpus = getVal(c->select_idle_mask); + struct task_struct *p = getVal(c->task); + struct cpumask *cpus_ptr; + int type = SELECT_RQ_RANGE; + long tag = getVal(p->tag); + int *idlest_cpu = 0; + int key = 0; + int ret; + + if (tag <= TAG_NONE || tag >= TAG_MAX) + return type; + + ret = libbpf_cpumask_cpulist_parse(tag_tbl[tag].buf, prefer_cpus); + if (ret) + return type; + + idlest_cpu = bpf_map_lookup_elem(&map_idlest_cpu, &key); + if (!idlest_cpu) + return type; 
+ + cpus_ptr = get_better_cpus(p, prefer_cpus, idlest_cpu); + libbpf_sched_set_cpus_ptr((void *)c, getVal(cpus_ptr)); + + return type; +} + +SEC("sched/cfs_select_rq_exit") +int BPF_PROG(cfs_select_cpu_range_exit, struct sched_migrate_ctx *c) +{ + int *idlest_cpu; + int key = 0; + + idlest_cpu = bpf_map_lookup_elem(&map_idlest_cpu, &key); + if (!idlest_cpu) { + libbpf_sched_set_cpus_ptr(c, (void *)getVal(c->cpus_allowed)); + return SELECT_RQ_EXIT_CPU_VALID; + } + + if (!libbpf_cpumask_test_cpu(getVal(c->new_cpu), + (void *)getVal(c->task->cpus_ptr))) { + libbpf_sched_set_cpus_ptr(c, (void *)getVal(c->cpus_allowed)); + return *idlest_cpu; + } + + libbpf_sched_set_cpus_ptr(c, (void *)getVal(c->cpus_allowed)); + return SELECT_RQ_EXIT_CPU_VALID; +} + +static int find_idlest_cpu(struct task_struct *p, int parent) +{ + unsigned long min = INT_MAX; + int min_load_cpu = 0; + unsigned long load; + int cpu; + int i; + + for (i = 0, cpu = -1; i < NR_CPUS; i++) { + cpu = libbpf_cpumask_next(cpu, (void *)getVal(p->cpus_ptr)); + if (cpu >= libbpf_nr_cpus_ids()) + break; + + load = libbpf_cfs_load_avg_of(cpu); + if (load < min) { + min = load; + min_load_cpu = cpu; + } + } + + return min_load_cpu; +} + +static int select_idle_cpu(struct task_struct *p, int parent, int prev_cpu) +{ + int cpu; + + if (libbpf_available_idle_cpu(prev_cpu)) + return prev_cpu; + + if (libbpf_available_idle_cpu(parent)) + return prev_cpu; + + libbpf_for_each_cpu_wrap(cpu, (void *)getVal(p->cpus_ptr), prev_cpu) { + if (libbpf_available_idle_cpu(cpu)) + return cpu; + } + + return prev_cpu; +} + +SEC("sched/cfs_select_rq") +int BPF_PROG(cfs_select_cpu, struct sched_migrate_ctx *c) +{ + struct task_struct *p = getVal(c->task); + int wake_flags = getVal(c->wake_flags); + int prev_cpu = getVal(c->prev_cpu); + int cpu = getVal(c->curr_cpu); + int new_cpu; + + if (wake_flags == WF_FORK) { + /* Slow path */ + new_cpu = find_idlest_cpu(p, cpu); + } else { + /* Fast path */ + new_cpu = select_idle_cpu(p, cpu, prev_cpu); + } + + return new_cpu; +} + +SEC("sched/cfs_wake_affine") +int BPF_PROG(cfs_wake_affine, struct sched_affine_ctx *c) +{ + int prev_cpu = getVal(c->prev_cpu); + int curr_cpu = getVal(c->curr_cpu); + int sync = getVal(c->is_sync); + + if (libbpf_available_idle_cpu(curr_cpu) && + libbpf_cpus_share_cache(curr_cpu, prev_cpu)) + return libbpf_available_idle_cpu(prev_cpu) ? 
prev_cpu : curr_cpu; + + if (sync && libbpf_nr_running_of(curr_cpu) == 1) + return curr_cpu; + + return prev_cpu; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/sched_select_core_user.c b/samples/bpf/sched_select_core_user.c new file mode 100644 index 000000000000..2cfe7a6223f1 --- /dev/null +++ b/samples/bpf/sched_select_core_user.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2022 Huawei +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> + +static void usage(void) +{ + printf("USAGE: test sched select core [...]\n"); + printf(" -W wakeup affine # Test sched wake wakeup\n"); + printf(" -C select core # Test sched select core\n"); + printf(" -R select core range # Test sched select core range\n"); + printf(" -h # Display this help\n"); +} + +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +/* read trace logs from debug fs */ +static void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +int main(int argc, char **argv) +{ + int opt; + char filename[256]; + char progname[4][256]; + struct bpf_object *obj; + struct bpf_program *prog[4] = {NULL}; + struct bpf_link *link[4] = {NULL}; + int prog_num = 1; + int i = 0; + + while ((opt = getopt(argc, argv, "C::R::W::E::")) != -1) { + switch (opt) { + case 'C': + snprintf(progname[0], sizeof(progname[0]), "cfs_select_cpu"); + break; + case 'R': + snprintf(progname[0], sizeof(progname[0]), "cfs_select_cpu_range"); + snprintf(progname[1], sizeof(progname[1]), "cfs_select_cpu_range_exit"); + prog_num = 2; + break; + case 'W': + snprintf(progname[0], sizeof(progname[0]), "cfs_wake_affine"); + break; + default: + usage(); + goto out; + } + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + goto out; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (i = 0; i < prog_num; i++) { + prog[i] = bpf_object__find_program_by_name(obj, progname[i]); + if (libbpf_get_error(prog[i])) { + fprintf(stderr, "ERROR: finding a prog %d in obj file failed\n", i); + goto cleanup; + } + + link[i] = bpf_program__attach(prog[i]); + if (libbpf_get_error(link[i])) { + fprintf(stderr, "ERROR: bpf_program__attach %d failed\n", i); + link[i] = NULL; + goto cleanup; + } + } + + printf("select rq BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + for (; i >= 0; i--) + bpf_link__destroy(link[i]); + bpf_object__close(obj); +out: + return 0; +}
From: Guan Jing guanjing6@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add a hook, cfs_tag_pick_next_entity (invoked through bpf_sched_cfs_tag_pick_next_entity()), in entity_before(). When the attached BPF program returns 1, the first entity is ordered before the second, which lets the program pick an online entity ahead of an offline one.
Signed-off-by: Guan Jing guanjing6@huawei.com --- include/linux/sched_hook_defs.h | 4 +++- kernel/sched/fair.c | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 9f87609cde8a..8b4db585d31f 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -2,7 +2,9 @@ BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec) BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, - struct sched_entity *se) + struct sched_entity *se) BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) BPF_SCHED_HOOK(int, 0, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, struct sched_entity *curr, + struct sched_entity *next) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79e0a475599a..b192a9873091 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,14 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline int entity_before(struct sched_entity *a, struct sched_entity *b) { +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_tag_pick_next_entity(a, b); + if (ret == 1) + return 1; /* pick online se */ + } +#endif + return (s64)(a->vruntime - b->vruntime) < 0; }
From: Guan Jing guanjing6@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB CVE: NA
--------------------------------
Add bpf sched pick task sample.
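Running './sched_pick_task' loads sched_pick_task_kern.o, attaches the sched_cfs_tag_pick_next_entity program to the new cfs_tag_pick_next_entity hook, and prints trace_pipe output until interrupted.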
Signed-off-by: Guan Jing guanjing6@huawei.com --- samples/bpf/Makefile | 3 + samples/bpf/sched_pick_task_kern.c | 62 ++++++++++++++++++++ samples/bpf/sched_pick_task_user.c | 92 ++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+) create mode 100644 samples/bpf/sched_pick_task_kern.c create mode 100644 samples/bpf/sched_pick_task_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 62dadae992a2..1d92e87565ad 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -56,6 +56,7 @@ tprogs-y += ibumad tprogs-y += hbm tprogs-y += sched_preempt tprogs-y += sched_select_core +tprogs-y += sched_pick_task
# Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -115,6 +116,7 @@ ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) sched_preempt-objs := sched_preempt_user.o sched_select_core-objs := sched_select_core_user.o +sched_pick_task-objs := sched_pick_task_user.o
# Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -178,6 +180,7 @@ always-y += hbm_edt_kern.o always-y += xdpsock_kern.o always-y += sched_preempt_kern.o always-y += sched_select_core_kern.o +always-y += sched_pick_task_kern.o
ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/sched_pick_task_kern.c b/samples/bpf/sched_pick_task_kern.c new file mode 100644 index 000000000000..5cd09e26f806 --- /dev/null +++ b/samples/bpf/sched_pick_task_kern.c @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/version.h> +#include <linux/sched.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define PICK_CURR 1; +#define PICK_NOMAL 0; +#define ERROR -1; + +enum task_type { + TASK_TYPE_OFFLINE = -1, + TASK_TYPE_ONLINE, + TASK_TYPE_MAX +}; + +int querry_se_tag(struct sched_entity *se) +{ + int se_tag = TASK_TYPE_ONLINE; + + if (bpf_sched_entity_is_task(se)) { + struct task_struct *task = bpf_sched_entity_to_task(se); + se_tag = bpf_sched_task_tag_of(task); + } else { + struct task_group *tg = bpf_sched_entity_to_tg(se); + se_tag = bpf_sched_tg_tag_of(tg); + } + + return se_tag; +} + +SEC("sched/cfs_tag_pick_next_entity") +int BPF_PROG(sched_cfs_tag_pick_next_entity, struct sched_entity *curr, struct sched_entity *next) +{ + int curr_type = 0; + int next_type = 0; + + if (curr == NULL || next == NULL) + return PICK_NOMAL; + + curr_type = querry_se_tag(curr); + next_type = querry_se_tag(next); + + if (curr_type > next_type) + return PICK_CURR; + + return PICK_NOMAL; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/sched_pick_task_user.c b/samples/bpf/sched_pick_task_user.c new file mode 100644 index 000000000000..316ff8a601e1 --- /dev/null +++ b/samples/bpf/sched_pick_task_user.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> + +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +/* read trace logs from debug fs */ +void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +int main(int argc, char **argv) +{ + char filename[256]; + struct bpf_object *obj; + struct bpf_program *prog; + struct bpf_link *link; + int err; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + /* Open BPF application */ + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 1; + } + + /* Load and verify BPF program */ + err = bpf_object__load(obj); + if (err) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "sched_cfs_tag_pick_next_entity"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + link = NULL; + goto cleanup; + } + + printf("preempt BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); +out: + return 0; +}