Hou Tao (3): bpf: Add map and need_defer parameters to .map_fd_put_ptr() bpf: Set need_defer as false when clearing fd array during map free bpf: Defer the free of inner map when necessary
Pu Lehui (1): bpf: Fix kabi breakage in struct bpf_map and struct bpf_map_ops
include/linux/bpf.h | 18 +++++++++++++++--- kernel/bpf/arraymap.c | 33 ++++++++++++++++++++------------- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/map_in_map.c | 13 +++++++++---- kernel/bpf/map_in_map.h | 2 +- kernel/bpf/syscall.c | 23 +++++++++++++++++++++-- 6 files changed, 69 insertions(+), 26 deletions(-)
From: Hou Tao houtao1@huawei.com
mainline inclusion from mainline-v6.8-rc1 commit 20c20bd11a0702ce4dc9300c3da58acf551d9725 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I932VJ CVE: CVE-2023-52447
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
map is the pointer of outer map, and need_defer needs some explanation. need_defer tells the implementation to defer the reference release of the passed element and ensure that the element is still alive before the bpf program, which may manipulate it, exits.
The following three cases will invoke map_fd_put_ptr() and different need_defer values will be passed to these callers:
1) release the reference of the old element in the map during map update or map deletion. The release must be deferred, otherwise the bpf program may incur use-after-free problem, so need_defer needs to be true. 2) release the reference of the to-be-added element in the error path of map update. The to-be-added element is not visible to any bpf program, so it is OK to pass false for need_defer parameter. 3) release the references of all elements in the map during map release. Any bpf program which has access to the map must have been exited and released, so need_defer=false will be OK.
These two parameters will be used by the following patches to fix the potential use-after-free problem for map-in-map.
Signed-off-by: Hou Tao houtao1@huawei.com Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com --- include/linux/bpf.h | 6 +++++- kernel/bpf/arraymap.c | 12 +++++++----- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/map_in_map.h | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eac39fc515a3..57917246d3c5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -82,7 +82,11 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - void (*map_fd_put_ptr)(void *ptr); + /* If need_defer is true, the implementation should guarantee that + * the to-be-put element is still alive before the bpf program, which + * may manipulate it, exists. + */ + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f241bda2679d..5102338129d5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -764,7 +764,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, }
if (old_ptr) - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; }
@@ -787,7 +787,7 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) }
if (old_ptr) { - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } else { return -ENOENT; @@ -811,8 +811,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, return prog; }
-static void prog_fd_array_put_ptr(void *ptr) +static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(ptr); }
@@ -1139,8 +1140,9 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, return ee; }
-static void perf_event_fd_array_put_ptr(void *ptr) +static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); }
@@ -1195,7 +1197,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map, return cgroup_get_from_fd(fd); }
-static void cgroup_fd_array_put_ptr(void *ptr) +static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0ce445aadfdf..ec8497314272 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -786,7 +786,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, true); } }
@@ -2023,7 +2023,7 @@ static void fd_htab_map_free(struct bpf_map *map) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); } }
@@ -2064,7 +2064,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
ret = htab_map_update_elem(map, key, &ptr, map_flags); if (ret) - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false);
return ret; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 39ab0b68cade..0cf4cb685810 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -100,7 +100,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; }
-void bpf_map_fd_put_ptr(void *ptr) +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* ptr->ops->map_free() has to go through one * rcu grace period by itself. diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index bcb7534afb3c..7d61602354de 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); -void bpf_map_fd_put_ptr(void *ptr); +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer); u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
From: Hou Tao houtao1@huawei.com
mainline inclusion from mainline-v6.8-rc1 commit 79d93b3c6ffd79abcd8e43345980aa1e904879c4 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I932VJ CVE: CVE-2023-52447
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Both map deletion operation, map release and map free operation use fd_array_map_delete_elem() to remove the element from fd array and need_defer is always true in fd_array_map_delete_elem(). For the map deletion operation and map release operation, need_defer=true is necessary, because the bpf program, which accesses the element in fd array, may still alive. However for map free operation, it is certain that the bpf program which owns the fd array has already been exited, so setting need_defer as false is appropriate for map free operation.
So fix it by adding need_defer parameter to bpf_fd_array_map_clear() and adding a new helper __fd_array_map_delete_elem() to handle the map deletion, map release and map free operations correspondingly.
Signed-off-by: Hou Tao houtao1@huawei.com Link: https://lore.kernel.org/r/20231204140425.1480317-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com --- kernel/bpf/arraymap.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5102338129d5..b5b8b57dc212 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -768,7 +768,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, return 0; }
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key) +static int __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; @@ -787,13 +787,18 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) }
if (old_ptr) { - map->ops->map_fd_put_ptr(map, old_ptr, true); + map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } }
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key) +{ + return __fd_array_map_delete_elem(map, key, true); +} + static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { @@ -823,13 +828,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) }
/* decrement refcnt of all bpf_progs that are stored in this map */ -static void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i;
for (i = 0; i < array->map.max_entries; i++) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, need_defer); }
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -1008,7 +1013,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, true); bpf_map_put(map); }
@@ -1160,7 +1165,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } @@ -1168,7 +1173,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); }
@@ -1205,7 +1210,7 @@ static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_de
static void cgroup_fd_array_free(struct bpf_map *map) { - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); }
@@ -1251,7 +1256,7 @@ static void array_of_map_free(struct bpf_map *map) * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); }
From: Hou Tao houtao1@huawei.com
mainline inclusion from mainline-v6.8-rc1 commit 876673364161da50eed6b472d746ef88242b2368 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I932VJ CVE: CVE-2023-52447
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When updating or deleting an inner map in map array or map htab, the map may still be accessed by non-sleepable program or sleepable program. However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map directly through bpf_map_put(), if the ref-counter is the last one (which is true for most cases), the inner map will be freed by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of a RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur use-after-free problem.
Fix the free of inner map by invoking bpf_map_free_deferred() after both one RCU grace period and one tasks trace RCU grace period if the inner map has been removed from the outer map before. The deferment is accomplished by using call_rcu() or call_rcu_tasks_trace() when releasing the last ref-counter of bpf map. The newly-added rcu_head field in bpf_map shares the same storage space with work field to reduce the size of bpf_map.
Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao houtao1@huawei.com Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov ast@kernel.org Conflicts: kernel/bpf/syscall.c Signed-off-by: Pu Lehui pulehui@huawei.com --- include/linux/bpf.h | 7 ++++++- kernel/bpf/map_in_map.c | 11 ++++++++--- kernel/bpf/syscall.c | 23 +++++++++++++++++++++-- 3 files changed, 35 insertions(+), 6 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 57917246d3c5..0c441d046e71 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -168,6 +168,7 @@ struct bpf_map { u32 btf_vmlinux_value_type_id; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; /* 22 bytes hole */
/* The 3rd and 4th cacheline with misc members to avoid false sharing @@ -175,7 +176,11 @@ struct bpf_map { */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; - struct work_struct work; + /* rcu is used before freeing and work is only used during freeing */ + union { + struct work_struct work; + struct rcu_head rcu; + }; struct mutex freeze_mutex; atomic64_t writecnt; }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 0cf4cb685810..caa1a17cbae1 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -102,10 +102,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { - /* ptr->ops->map_free() has to go through one - * rcu grace period by itself. + struct bpf_map *inner_map = ptr; + + /* The inner map may still be used by both non-sleepable and sleepable + * bpf program, so free it after one RCU grace period and one tasks + * trace RCU grace period. */ - bpf_map_put(ptr); + if (need_defer) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + bpf_map_put(inner_map); }
u32 bpf_map_fd_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a1f3a5f05e9a..ba690c210f57 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -494,6 +494,22 @@ static void bpf_map_put_uref(struct bpf_map *map) } }
+static void bpf_map_free_in_work(struct bpf_map *map) +{ + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); +} + +static void bpf_map_free_rcu_gp(struct rcu_head *rcu) +{ + bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); +} + +static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) +{ + call_rcu(rcu, bpf_map_free_rcu_gp); +} + /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ @@ -503,8 +519,11 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); - INIT_WORK(&map->work, bpf_map_free_deferred); - schedule_work(&map->work); + + if (READ_ONCE(map->free_after_mult_rcu_gp)) + call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else + bpf_map_free_in_work(map); } }
hulk inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I932VJ CVE: CVE-2023-52447
--------------------------------
Fix kabi breakage in struct bpf_map and struct bpf_map_ops.
Signed-off-by: Pu Lehui pulehui@huawei.com --- include/linux/bpf.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0c441d046e71..84d4e4849e3c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -86,7 +86,8 @@ struct bpf_map_ops { * the to-be-put element is still alive before the bpf program, which * may manipulate it, exists. */ - void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); + KABI_BROKEN_REPLACE(void (*map_fd_put_ptr)(void *ptr), + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer)) int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, @@ -168,8 +169,8 @@ struct bpf_map { u32 btf_vmlinux_value_type_id; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ - bool free_after_mult_rcu_gp; - /* 22 bytes hole */ + KABI_EXTEND(bool free_after_mult_rcu_gp) + /* 17 bytes hole */
/* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. @@ -177,10 +178,12 @@ struct bpf_map { atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; /* rcu is used before freeing and work is only used during freeing */ + KABI_BROKEN_REPLACE( + struct work_struct work, union { struct work_struct work; struct rcu_head rcu; - }; + }) struct mutex freeze_mutex; atomic64_t writecnt; };
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/5343 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/M...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/5343 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/M...