Andrii Nakryiko (1): bpf: support deferring bpf_link dealloc to after RCU grace period
Cong Wang (1): bpf: Fix a potential use-after-free in bpf_link_free()
Tengda Wu (1): Fix kabi breakage in struct bpf_link and bpf_link_ops
 include/linux/bpf.h      | 17 +++++++++++++++--
 kernel/bpf/syscall.c     | 38 ++++++++++++++++++++++++++++++++++----
 kernel/trace/bpf_trace.c |  4 ++--
 3 files changed, 51 insertions(+), 8 deletions(-)
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/10658 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/A...
From: Andrii Nakryiko <andrii@kernel.org>
stable inclusion
from stable-v6.6.26
commit 876941f533e7b47fc69977fc4551c02f2d18af97
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9QG81
CVE: CVE-2024-35860
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce upstream.
BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
Because of this runtime dependency, when the bpf_link refcnt drops to zero there could still be active BPF programs running that access link data.
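As an aside (not part of this patch), a minimal multi-kprobe program illustrating that runtime dependency might look like the sketch below; the section name, probe target, and program body are purely illustrative. bpf_get_attach_cookie() resolves the cookie through the attached link, which is why the link has to stay valid for as long as the program can still be running.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  SEC("kprobe.multi/do_sys_openat2")
  int BPF_PROG(trace_open)
  {
          /* looked up through the attached bpf_link at runtime */
          __u64 cookie = bpf_get_attach_cookie(ctx);

          bpf_printk("open, cookie=%llu", cookie);
          return 0;
  }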
This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP.
BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable.
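Condensed from the bpf_link_free() changes in the diff below, the grace-period choice is roughly:

  /* sketch of the scheduling decision in bpf_link_free(), see the diff below */
  if (link->ops->dealloc_deferred) {
          if (sleepable)
                  /* tasks trace GP first, then (unless implied) a classic RCU GP */
                  call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
          else
                  /* classic RCU GP only */
                  call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
  }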
We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well.
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com
Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com
Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Conflicts:
	include/linux/bpf.h
[Bpf related structures have changed in commit 6b6e3a2eac5f ("kabi: reserve
space for bpf related structures"), causing conflicts in this patch merge]
Signed-off-by: Tengda Wu <wutengda2@huawei.com>
Signed-off-by: Pu Lehui <pulehui@huawei.com>
---
 include/linux/bpf.h      | 16 +++++++++++++++-
 kernel/bpf/syscall.c     | 35 ++++++++++++++++++++++++++++++++---
 kernel/trace/bpf_trace.c |  4 ++--
 3 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3a3fa89156c3..b11095aaa684 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1582,7 +1582,13 @@ struct bpf_link {
 	enum bpf_link_type type;
 	const struct bpf_link_ops *ops;
 	struct bpf_prog *prog;
-	struct work_struct work;
+	/* rcu is used before freeing, work can be used to schedule that
+	 * RCU-based freeing before that, so they never overlap
+	 */
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
 
 	KABI_RESERVE(1)
 	KABI_RESERVE(2)
@@ -1592,7 +1598,15 @@ struct bpf_link {
 
 struct bpf_link_ops {
 	void (*release)(struct bpf_link *link);
+	/* deallocate link resources callback, called without RCU grace period
+	 * waiting
+	 */
 	void (*dealloc)(struct bpf_link *link);
+	/* deallocate link resources callback, called after RCU grace period;
+	 * if underlying BPF program is sleepable we go through tasks trace
+	 * RCU GP and then "classic" RCU GP
+	 */
+	void (*dealloc_deferred)(struct bpf_link *link);
 	int (*detach)(struct bpf_link *link);
 	int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
 			   struct bpf_prog *old_prog);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ade28c519165..d1c9b536b012 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2873,17 +2873,46 @@ void bpf_link_inc(struct bpf_link *link)
 	atomic64_inc(&link->refcnt);
 }
 
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
+{
+	struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+
+	/* free bpf_link and its containing memory */
+	link->ops->dealloc_deferred(link);
+}
+
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_link_defer_dealloc_rcu_gp(rcu);
+	else
+		call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
+
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
+	bool sleepable = false;
+
 	bpf_link_free_id(link->id);
 	if (link->prog) {
+		sleepable = link->prog->aux->sleepable;
 		/* detach BPF program, clean up used resources */
 		link->ops->release(link);
 		bpf_prog_put(link->prog);
 	}
-	/* free bpf_link and its containing memory */
-	link->ops->dealloc(link);
+	if (link->ops->dealloc_deferred) {
+		/* schedule BPF link deallocation; if underlying BPF program
+		 * is sleepable, we need to first wait for RCU tasks trace
+		 * sync, then go through "classic" RCU grace period
+		 */
+		if (sleepable)
+			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+		else
+			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+	}
+	if (link->ops->dealloc)
+		link->ops->dealloc(link);
 }
 
 static void bpf_link_put_deferred(struct work_struct *work)
@@ -3406,7 +3435,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
 
 static const struct bpf_link_ops bpf_raw_tp_link_lops = {
 	.release = bpf_raw_tp_link_release,
-	.dealloc = bpf_raw_tp_link_dealloc,
+	.dealloc_deferred = bpf_raw_tp_link_dealloc,
 	.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
 	.fill_link_info = bpf_raw_tp_link_fill_link_info,
 };
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 74c994ae58f4..cc29bf49f715 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2639,7 +2639,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 
 static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
 	.release = bpf_kprobe_multi_link_release,
-	.dealloc = bpf_kprobe_multi_link_dealloc,
+	.dealloc_deferred = bpf_kprobe_multi_link_dealloc,
 	.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
 };
 
@@ -3082,7 +3082,7 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
 
 static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
 	.release = bpf_uprobe_multi_link_release,
-	.dealloc = bpf_uprobe_multi_link_dealloc,
+	.dealloc_deferred = bpf_uprobe_multi_link_dealloc,
 };
 
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
From: Tengda Wu <wutengda2@huawei.com>
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9QG81
CVE: CVE-2024-35860
--------------------------------
After backporting LTS commit 777a1adfea1c ("[Backport] bpf: support deferring bpf_link dealloc to after RCU grace period"), the `bpf_link` and `bpf_link_ops` structures have changed, causing kabi breakage.
Use KABI_REPLACE and KABI_USE to fix kabi breakage in struct `bpf_link` and struct `bpf_link_ops`.
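As a rough illustration of why the replacement is KABI-safe (the union overlays the original member without growing it): the type name below is hypothetical and the real enforcement lives in the KABI_* macro definitions, but the invariant checked here is the one that matters.

  #include <linux/build_bug.h>
  #include <linux/types.h>      /* struct rcu_head */
  #include <linux/workqueue.h>  /* struct work_struct */

  /* hypothetical stand-in for the union introduced by KABI_REPLACE() */
  union bpf_link_defer_storage {
          struct rcu_head rcu;
          struct work_struct work;
  };

  /* struct work_struct is the larger member, so overlaying rcu_head on it
   * leaves the size and member offsets of struct bpf_link unchanged */
  static_assert(sizeof(union bpf_link_defer_storage) == sizeof(struct work_struct));

Likewise, KABI_USE(1, ...) places the new dealloc_deferred callback into the padding slot previously set aside by KABI_RESERVE(1), so struct bpf_link_ops keeps its size.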
Fixes: 1a1260f3db15 ("bpf: support deferring bpf_link dealloc to after RCU grace period")
Signed-off-by: Tengda Wu <wutengda2@huawei.com>
Signed-off-by: Pu Lehui <pulehui@huawei.com>
---
 include/linux/bpf.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b11095aaa684..8c4c2c39a6c1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1585,10 +1585,10 @@ struct bpf_link {
 	/* rcu is used before freeing, work can be used to schedule that
 	 * RCU-based freeing before that, so they never overlap
 	 */
-	union {
+	KABI_REPLACE(struct work_struct work, union {
 		struct rcu_head rcu;
 		struct work_struct work;
-	};
+	})
 
 	KABI_RESERVE(1)
 	KABI_RESERVE(2)
@@ -1602,11 +1602,6 @@ struct bpf_link_ops {
 	 * waiting
 	 */
 	void (*dealloc)(struct bpf_link *link);
-	/* deallocate link resources callback, called after RCU grace period;
-	 * if underlying BPF program is sleepable we go through tasks trace
-	 * RCU GP and then "classic" RCU GP
-	 */
-	void (*dealloc_deferred)(struct bpf_link *link);
 	int (*detach)(struct bpf_link *link);
 	int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
 			   struct bpf_prog *old_prog);
@@ -1616,7 +1611,11 @@ struct bpf_link_ops {
 	int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
 			  struct bpf_map *old_map);
 
-	KABI_RESERVE(1)
+	/* deallocate link resources callback, called after RCU grace period;
+	 * if underlying BPF program is sleepable we go through tasks trace
+	 * RCU GP and then "classic" RCU GP
+	 */
+	KABI_USE(1, void (*dealloc_deferred)(struct bpf_link *link))
 	KABI_RESERVE(2)
 	KABI_RESERVE(3)
 	KABI_RESERVE(4)
From: Cong Wang <cong.wang@bytedance.com>
stable inclusion
from stable-v6.6.35
commit 91cff53136daeff50816b0baeafd38a6976f6209
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9QG81
CVE: CVE-2024-35860
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit 2884dc7d08d98a89d8d65121524bb7533183a63a ]
After commit 1a80dbcb2dba, bpf_link can be freed by link->ops->dealloc_deferred, but the code still tests and uses link->ops->dealloc afterward, which leads to a use-after-free as reported by syzbot. Actually, one of them should be sufficient, so just call one of them instead of both. Also add a WARN_ON() in case of any problematic implementation.
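Condensed from the diff below, the old and new control flow in bpf_link_free() compare as follows (sketch only):

  /* before: both callbacks could be invoked for the same link */
  if (link->ops->dealloc_deferred)
          call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); /* frees link after GP */
  if (link->ops->dealloc)          /* link->ops dereferenced again after queueing */
          link->ops->dealloc(link);

  /* after: ops is loaded once up front and exactly one callback runs */
  if (ops->dealloc_deferred)
          call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
  else if (ops->dealloc)
          ops->dealloc(link);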
Fixes: 1a80dbcb2dba ("bpf: support deferring bpf_link dealloc to after RCU grace period")
Reported-by: syzbot+1989ee16d94720836244@syzkaller.appspotmail.com
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20240602182703.207276-1-xiyou.wangcong@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Tengda Wu <wutengda2@huawei.com>
Signed-off-by: Pu Lehui <pulehui@huawei.com>
---
 kernel/bpf/syscall.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d1c9b536b012..bd5b5a9adfd4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2834,6 +2834,7 @@ static int bpf_obj_get(const union bpf_attr *attr)
 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
 		   const struct bpf_link_ops *ops, struct bpf_prog *prog)
 {
+	WARN_ON(ops->dealloc && ops->dealloc_deferred);
 	atomic64_set(&link->refcnt, 1);
 	link->type = type;
 	link->id = 0;
@@ -2892,16 +2893,17 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
+	const struct bpf_link_ops *ops = link->ops;
 	bool sleepable = false;
 
 	bpf_link_free_id(link->id);
 	if (link->prog) {
 		sleepable = link->prog->aux->sleepable;
 		/* detach BPF program, clean up used resources */
-		link->ops->release(link);
+		ops->release(link);
 		bpf_prog_put(link->prog);
 	}
-	if (link->ops->dealloc_deferred) {
+	if (ops->dealloc_deferred) {
 		/* schedule BPF link deallocation; if underlying BPF program
 		 * is sleepable, we need to first wait for RCU tasks trace
 		 * sync, then go through "classic" RCU grace period
@@ -2910,9 +2912,8 @@ static void bpf_link_free(struct bpf_link *link)
 		 */
 		if (sleepable)
 			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
 		else
 			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
-	}
-	if (link->ops->dealloc)
-		link->ops->dealloc(link);
+	} else if (ops->dealloc)
+		ops->dealloc(link);
 }
static void bpf_link_put_deferred(struct work_struct *work)