From: Andrii Nakryiko andrii@kernel.org
mainline inclusion from mainline-v5.15-rc1 commit c7603cfa04e7c3a435b31d065f7cbdc829428f6e category: bugfix bugzilla: 182996 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions.
While this seems to work good in practice, it introduces new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely it is. It might also not be so unlikely when/if we allow sleepable cgroup BPF programs in the future.
Further, the way that cgroup storage is implemented as ambiently-available property during entire BPF program execution is a convenient way to pass extra information to BPF program and helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that can allow implementing this without arbitrary restrictions. Ideally, such solution would work for both preemptable and sleepable BPF programs in exactly the same way.
This patch introduces such solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering previous current->bpf_ctx value locally while executing nested BPF program and restoring old value after nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively.
Restoring old value of the pointer handles preemption, while bpf_run_ctx pointer being a property of current task_struct naturally solves this problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of CPU. It would even allow CPU migration of BPF programs, even though it's not currently allowed by BPF infra.
This patch cleans up cgroup local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow up patches are planned that will expand this mechanism for other uses within tracing BPF programs.
To verify that this change doesn't revert the fix to the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work).
[0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/
Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Andrii Nakryiko andrii@kernel.org Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: Yonghong Song yhs@fb.com Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org Conflicts: include/linux/bpf.h include/linux/sched.h kernel/fork.c net/bpf/test_run.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com
Signed-off-by: Chen Jun chenjun102@huawei.com --- include/linux/bpf-cgroup.h | 54 --------------------------- include/linux/bpf.h | 75 ++++++++++++++++---------------------- include/linux/sched.h | 5 +++ kernel/bpf/helpers.c | 16 +++----- kernel/bpf/local_storage.c | 3 -- kernel/fork.c | 3 ++ net/bpf/test_run.c | 22 +++++------ 7 files changed, 56 insertions(+), 122 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 91b966978541..5120ca319ff4 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -27,19 +27,6 @@ struct task_struct; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
-#define BPF_CGROUP_STORAGE_NEST_MAX 8 - -struct bpf_cgroup_storage_info { - struct task_struct *task; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; -}; - -/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks - * to use bpf cgroup storage simultaneously. - */ -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -167,44 +154,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( return BPF_CGROUP_STORAGE_SHARED; }
-static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - enum bpf_cgroup_storage_type stype; - int i, err = 0; - - preempt_disable(); - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, current); - for_each_cgroup_storage_type(stype) - this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], - storage[stype]); - goto out; - } - err = -EBUSY; - WARN_ON_ONCE(1); - -out: - preempt_enable(); - return err; -} - -static inline void bpf_cgroup_storage_unset(void) -{ - int i; - - for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { - if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); - return; - } -} - struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); @@ -450,9 +399,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; }
-static inline int bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } -static inline void bpf_cgroup_storage_unset(void) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f931fffa317b..fc83187082ed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1078,43 +1078,20 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array);
-/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY, - * if bpf_cgroup_storage_set() failed, the rest of programs - * will not execute. This should be a really rare scenario - * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of - * preemptions all between bpf_cgroup_storage_set() and - * bpf_cgroup_storage_unset() on the same cpu. - */ -#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ - ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - u32 _ret = 1; \ - u32 func_ret; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ - func_ret = func(_prog, ctx); \ - _ret &= (func_ret & 1); \ - *(ret_flags) |= (func_ret >> 1); \ - bpf_cgroup_storage_unset(); \ - _item++; \ - } \ - rcu_read_unlock(); \ - migrate_enable(); \ - _ret; \ - }) +struct bpf_run_ctx {}; + +struct bpf_cg_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_prog_array_item *prog_item; +};
#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ migrate_disable(); \ rcu_read_lock(); \ @@ -1122,17 +1099,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, if (unlikely(check_non_null && !_array))\ goto _out; \ _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - if (!set_cg_storage) { \ - _ret &= func(_prog, ctx); \ - } else { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ - _ret &= func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ - } \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ + while ((_prog = READ_ONCE(_item->prog))) { \ + run_ctx.prog_item = _item; \ + _ret &= func(_prog, ctx); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ _out: \ rcu_read_unlock(); \ migrate_enable(); \ @@ -1166,6 +1139,8 @@ _out: \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 ret; \ u32 _ret = 1; \ u32 _cn = 0; \ @@ -1173,15 +1148,15 @@ _out: \ rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ while ((_prog = READ_ONCE(_item->prog))) { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ + run_ctx.prog_item = _item; \ ret = func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ _ret &= (ret & 1); \ _cn |= (ret & 2); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ rcu_read_unlock(); \ migrate_enable(); \ if (_ret) \ @@ -1231,6 +1206,20 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); }
+static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx; + + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ + current->bpf_ctx = old_ctx; +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2fd7b89c50ef..9e9c0bd4197d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -41,6 +41,7 @@ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; +struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1341,6 +1342,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; +#endif
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0efe7c7bfe5e..bb4350de9f11 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -372,8 +372,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { };
#ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -382,17 +380,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; void *ptr; - int i;
- for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { - if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); - break; - } + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype];
if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index b139247d2dd3..49ca8771d470 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -11,9 +11,6 @@
#ifdef CONFIG_CGROUP_BPF
-DEFINE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #include "../cgroup/cgroup-internal.h"
#define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/kernel/fork.c b/kernel/fork.c index 1abdc1f6eeca..1c95cd59a635 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2099,6 +2099,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + p->bpf_ctx = NULL; +#endif
/* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index eb684f31fd69..99712d35e535 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -19,18 +19,20 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_prog_array_item item = {.prog = prog}; + struct bpf_run_ctx *old_ctx; + struct bpf_cg_run_ctx run_ctx; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; int ret = 0; u32 i;
for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; + item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(item.cgroup_storage[stype])) { + item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } @@ -41,18 +43,15 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, rcu_read_lock(); migrate_disable(); time_start = ktime_get_ns(); + old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); for (i = 0; i < repeat; i++) { - ret = bpf_cgroup_storage_set(storage); - if (ret) - break; + run_ctx.prog_item = &item;
if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx);
- bpf_cgroup_storage_unset(); - if (signal_pending(current)) { ret = -EINTR; break; @@ -70,6 +69,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, time_start = ktime_get_ns(); } } + bpf_reset_run_ctx(old_ctx); time_spent += ktime_get_ns() - time_start; migrate_enable(); rcu_read_unlock(); @@ -78,7 +78,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]);
return ret; }