From: Hui Tang <tanghui20@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5KUFB
CVE: NA
--------------------------------
1. Add sample for the 'cfs_select_rq' hook
2. Add sample for the 'cfs_wake_affine' hook
3. Add sample for the 'cfs_select_rq_exit' hook
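A quick usage sketch (hypothetical invocation; the option letters follow the
sample's usage() output, and running from samples/bpf as root is assumed):

  # ./sched_select_core -C    # attach 'cfs_select_cpu' (select core test)
  # ./sched_select_core -R    # attach 'cfs_select_cpu_range' + 'cfs_select_cpu_range_exit'
  # ./sched_select_core -W    # attach 'cfs_wake_affine' (wakeup affine test)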
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 samples/bpf/Makefile                 |   3 +
 samples/bpf/sched_select_core_kern.c | 259 +++++++++++++++++++++++++++
 samples/bpf/sched_select_core_user.c | 125 +++++++++++++
 3 files changed, 387 insertions(+)
 create mode 100644 samples/bpf/sched_select_core_kern.c
 create mode 100644 samples/bpf/sched_select_core_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index e473bad76549..62dadae992a2 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -55,6 +55,7 @@ tprogs-y += xdp_sample_pkts
 tprogs-y += ibumad
 tprogs-y += hbm
 tprogs-y += sched_preempt
+tprogs-y += sched_select_core
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -113,6 +114,7 @@ xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
 ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
 hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
 sched_preempt-objs := sched_preempt_user.o
+sched_select_core-objs := sched_select_core_user.o
 
 # Tell kbuild to always build the programs
 always-y := $(tprogs-y)
@@ -175,6 +177,7 @@ always-y += hbm_out_kern.o
 always-y += hbm_edt_kern.o
 always-y += xdpsock_kern.o
 always-y += sched_preempt_kern.o
+always-y += sched_select_core_kern.o
 
 ifeq ($(ARCH), arm)
 # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux
diff --git a/samples/bpf/sched_select_core_kern.c b/samples/bpf/sched_select_core_kern.c
new file mode 100644
index 000000000000..18617e89b395
--- /dev/null
+++ b/samples/bpf/sched_select_core_kern.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Sample select core BPF program.
+ * 'cfs_select_rq'
+ *	Replace the original core selection policy or
+ *	implement dynamic CPU affinity.
+ *
+ * 'cfs_select_rq_exit'
+ *	Restore the CPU affinity of the task before exiting
+ *	'select_task_rq_fair'.
+ *
+ *	To be used with the 'cfs_select_rq' hook to implement
+ *	dynamic CPU affinity.
+ *
+ * 'cfs_wake_affine'
+ *	Determine on which CPU the task can run soonest. Allows the
+ *	user to implement different policies.
+ */
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <uapi/linux/bpf.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/libbpf_sched.h>
+#include <linux/cpumask.h>
+
+#define STR_MAX				(32)
+#define SELECT_RQ_RANGE			(-1)
+#define SELECT_RQ_EXIT_CPU_VALID	(-2)
+
+/* From kernel/sched/sched.h */
+#define WF_SYNC		0x01 /* Waker goes to sleep after wakeup */
+#define WF_FORK		0x02 /* Child wakeup after fork */
+#define WF_MIGRATED	0x04 /* Internal use, task got migrated */
+#define WF_ON_CPU	0x08 /* Wakee is on_cpu */
+
+#define TAG_ID(id)	TAG_##id
+
+enum tag_id {
+	TAG_NONE,
+	TAG_ID(1),
+	TAG_ID(2),
+	TAG_MAX
+};
+
+struct tag_info {
+	long tag;
+	char buf[STR_MAX];
+};
+
+struct tag_info tag_tbl[] = {
+	{TAG_NONE, ""},
+	{TAG_ID(1), "0-3"},
+	{TAG_ID(2), "4-7"},
+	{TAG_MAX, ""},
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, int);
+	__uint(max_entries, 1);
+} map_idlest_cpu SEC(".maps");
+
+int sysctl_sched_util_low_pct = 85;
+
+static inline bool prefer_cpus_valid(struct cpumask *prefer_cpus,
+				     struct cpumask *cpus_allowed)
+{
+	return !libbpf_cpumask_empty(prefer_cpus) &&
+	       !libbpf_cpumask_equal(prefer_cpus, cpus_allowed) &&
+	       libbpf_cpumask_subset(prefer_cpus, cpus_allowed);
+}
+
+static struct cpumask *select_better_cpus(struct task_struct *p,
+					  struct cpumask *prefer_cpus,
+					  int *idlest_cpu)
+{
+	unsigned long util_avg_sum = 0;
+	unsigned long tg_capacity = 0;
+	unsigned int weight;
+	long min_util = INT_MIN;
+	struct task_group *tg;
+	long spare;
+	int cpu;
+
+	if (!prefer_cpus_valid(prefer_cpus, (void *)getVal(p->cpus_ptr)))
+		return (void *)getVal(p->cpus_ptr);
+
+	tg = p->sched_task_group;
+	libbpf_for_each_cpu(cpu, prefer_cpus) {
+		if (idlest_cpu && libbpf_available_idle_cpu(cpu)) {
+			*idlest_cpu = cpu;
+		} else if (idlest_cpu) {
+			spare = (long)(libbpf_capacity_of(cpu) -
+				       libbpf_cfs_util_avg_of(cpu));
+			if (spare > min_util) {
+				min_util = spare;
+				*idlest_cpu = cpu;
+			}
+		}
+
+		if (libbpf_available_idle_cpu(cpu))
+			return getVal(prefer_cpus);
+
+		util_avg_sum += libbpf_cfs_util_avg_of(cpu);
+		tg_capacity += libbpf_capacity_of(cpu);
+	}
+
+	weight = libbpf_cpumask_weight(prefer_cpus);
+	if (tg_capacity > weight &&
+	    util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) {
+		return getVal(prefer_cpus);
+	}
+
+	return (void *)getVal(p->cpus_ptr);
+}
+
+SEC("sched/cfs_select_rq")
+int BPF_PROG(cfs_select_cpu_range, struct sched_migrate_ctx *h_ctx)
+{
+	struct cpumask *prefer_cpus = getVal(h_ctx->select_idle_mask);
+	struct task_struct *p = getVal(h_ctx->task);
+	struct cpumask *cpus_ptr;
+	int type = SELECT_RQ_RANGE;
+	long tag = getVal(p->tag);
+	int *idlest_cpu = NULL;
+	int key = 0;
+	int ret;
+
+	if (tag <= TAG_NONE || tag >= TAG_MAX)
+		return type;
+
+	ret = libbpf_cpumask_cpulist_parse(tag_tbl[tag].buf, prefer_cpus);
+	if (ret)
+		return type;
+
+	idlest_cpu = bpf_map_lookup_elem(&map_idlest_cpu, &key);
+	if (!idlest_cpu)
+		return type;
+
+	cpus_ptr = select_better_cpus(p, prefer_cpus, idlest_cpu);
+	libbpf_sched_set_task_cpus_ptr((void *)h_ctx, getVal(cpus_ptr));
+
+	return type;
+}
+
+SEC("sched/cfs_select_rq_exit")
+int BPF_PROG(cfs_select_cpu_range_exit, struct sched_migrate_ctx *h_ctx)
+{
+	int *idlest_cpu;
+	int key = 0;
+
+	idlest_cpu = bpf_map_lookup_elem(&map_idlest_cpu, &key);
+	if (!idlest_cpu) {
+		libbpf_sched_set_task_cpus_ptr(h_ctx,
+					       (void *)getVal(h_ctx->cpus_allowed));
+		return SELECT_RQ_EXIT_CPU_VALID;
+	}
+
+	if (!libbpf_cpumask_test_cpu(getVal(h_ctx->new_cpu),
+				     (void *)getVal(h_ctx->task->cpus_ptr))) {
+		libbpf_sched_set_task_cpus_ptr(h_ctx,
+					       (void *)getVal(h_ctx->cpus_allowed));
+		return *idlest_cpu;
+	}
+
+	libbpf_sched_set_task_cpus_ptr(h_ctx, (void *)getVal(h_ctx->cpus_allowed));
+	return SELECT_RQ_EXIT_CPU_VALID;
+}
+
+static int find_idlest_cpu(struct task_struct *p, int parent)
+{
+	unsigned long min = INT_MAX;
+	int min_load_cpu = 0;
+	unsigned long load;
+	int cpu;
+	int i;
+
+	for (i = 0, cpu = -1; i < NR_CPUS; i++) {
+		cpu = libbpf_cpumask_next(cpu, (void *)getVal(p->cpus_ptr));
+		if (cpu >= libbpf_nr_cpus_ids())
+			break;
+
+		load = libbpf_cfs_load_avg_of(cpu);
+		if (load < min) {
+			min = load;
+			min_load_cpu = cpu;
+		}
+	}
+
+	return min_load_cpu;
+}
+
+static int select_idle_cpu(struct task_struct *p, int parent, int prev_cpu)
+{
+	int cpu;
+
+	if (libbpf_available_idle_cpu(prev_cpu))
+		return prev_cpu;
+
+	if (libbpf_available_idle_cpu(parent))
+		return parent;
+
+	libbpf_for_each_cpu_wrap(cpu, (void *)getVal(p->cpus_ptr), prev_cpu) {
+		if (libbpf_available_idle_cpu(cpu))
+			return cpu;
+	}
+
+	return prev_cpu;
+}
+
+SEC("sched/cfs_select_rq")
+int BPF_PROG(cfs_select_cpu, struct sched_migrate_ctx *h_ctx)
+{
+	struct task_struct *p = getVal(h_ctx->task);
+	int wake_flags = getVal(h_ctx->wake_flags);
+	int prev_cpu = getVal(h_ctx->prev_cpu);
+	int cpu = getVal(h_ctx->curr_cpu);
+	int new_cpu;
+
+	if (wake_flags == WF_FORK) {
+		/* Slow path */
+		new_cpu = find_idlest_cpu(p, cpu);
+	} else {
+		/* Fast path */
+		new_cpu = select_idle_cpu(p, cpu, prev_cpu);
+	}
+
+	return new_cpu;
+}
+
+SEC("sched/cfs_wake_affine")
+int BPF_PROG(cfs_wake_affine, struct sched_affine_ctx *h_ctx)
+{
+	int prev_cpu = getVal(h_ctx->prev_cpu);
+	int curr_cpu = getVal(h_ctx->curr_cpu);
+	int sync = getVal(h_ctx->is_sync);
+
+	if (libbpf_available_idle_cpu(curr_cpu) &&
+	    libbpf_cpus_share_cache(curr_cpu, prev_cpu))
+		return libbpf_available_idle_cpu(prev_cpu) ?
+		       prev_cpu : curr_cpu;
+
+	if (sync && libbpf_nr_running_of(curr_cpu) == 1)
+		return curr_cpu;
+
+	return prev_cpu;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sched_select_core_user.c b/samples/bpf/sched_select_core_user.c
new file mode 100644
index 000000000000..99c98f394478
--- /dev/null
+++ b/samples/bpf/sched_select_core_user.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/resource.h>
+#include <bpf/libbpf.h>
+
+static void usage(void)
+{
+	printf("USAGE: test sched select core [...]\n");
+	printf("       -W    wakeup affine      # Test sched wake wakeup\n");
+	printf("       -C    select core        # Test sched select core\n");
+	printf("       -R    select core range  # Test sched select core range\n");
+	printf("       -h                       # Display this help\n");
+}
+
+#define TRACE_DIR	"/sys/kernel/debug/tracing/"
+#define BUF_SIZE	(4096)
+
+/* read trace logs from debug fs */
+static void read_trace_pipe(void)
+{
+	int trace_fd;
+
+	trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0)
+		return;
+
+	while (1) {
+		static char buf[BUF_SIZE];
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf) - 1);
+		if (sz > 0) {
+			buf[sz] = 0;
+			puts(buf);
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int opt;
+	char filename[256];
+	char progname[4][256];
+	struct bpf_object *obj;
+	struct bpf_program *prog[4] = {NULL};
+	struct bpf_link *link[4] = {NULL};
+	int prog_num = 1;
+	int i = 0;
+
+	while ((opt = getopt(argc, argv, "C::R::W::E::")) != -1) {
+		switch (opt) {
+		case 'C':
+			snprintf(progname[0], sizeof(progname[0]), "cfs_select_cpu");
+			break;
+		case 'R':
+			snprintf(progname[0], sizeof(progname[0]), "cfs_select_cpu_range");
+			snprintf(progname[1], sizeof(progname[1]), "cfs_select_cpu_range_exit");
+			prog_num = 2;
+			break;
+		case 'W':
+			snprintf(progname[0], sizeof(progname[0]), "cfs_wake_affine");
+			break;
+		default:
+			usage();
+			goto out;
+		}
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		goto out;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	for (i = 0; i < prog_num; i++) {
+		prog[i] = bpf_object__find_program_by_name(obj, progname[i]);
+		if (libbpf_get_error(prog[i])) {
+			fprintf(stderr, "ERROR: finding a prog %d in obj file failed\n", i);
+			goto cleanup;
+		}
+
+		link[i] = bpf_program__attach(prog[i]);
+		if (libbpf_get_error(link[i])) {
+			fprintf(stderr, "ERROR: bpf_program__attach %d failed\n", i);
+			link[i] = NULL;
+			goto cleanup;
+		}
+	}
+
+	printf("select rq BPF started, hit Ctrl+C to stop!\n");
+
+	read_trace_pipe();
+
+cleanup:
+	for (; i >= 0; i--)
+		bpf_link__destroy(link[i]);
+	bpf_object__close(obj);
+out:
+	return 0;
+}