KABI for cgroup
Chen Ridong (3): Revert "cgroup: fix uaf when proc_cpuset_show" cgroup/cpuset: Prevent UAF in proc_cpuset_show() cgroup: add more reserve kabi
Waiman Long (1): cgroup: Move rcu_head up near the top of cgroup_root
Yafang Shao (1): cgroup: Make operations on the cgroup root_list RCU safe
include/linux/cgroup-defs.h | 14 +++++++++--- include/linux/memcontrol.h | 8 +++++++ kernel/cgroup/cgroup-internal.h | 3 ++- kernel/cgroup/cgroup.c | 23 ++++++++++++++------ kernel/cgroup/cpuset.c | 38 ++++++--------------------------- 5 files changed, 44 insertions(+), 42 deletions(-)
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IA9YQ9
--------------------------------
To keep the same with the mainline and backport the lts patch. This reverts commit 24c448de81d48ad08925dda9869bcf535a3258b8.
Fixes: 24c448de81d4 ("cgroup: fix uaf when proc_cpuset_show") Signed-off-by: Chen Ridong chenridong@huawei.com --- kernel/cgroup/cpuset.c | 24 ------------------------ 1 file changed, 24 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2c9e50f09fc1..140dfb5ad3fc 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -5185,7 +5185,6 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, char *buf; struct cgroup_subsys_state *css; int retval; - struct cgroup *root_cgroup = NULL;
retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); @@ -5193,32 +5192,9 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, goto out;
css = task_get_css(tsk, cpuset_cgrp_id); - rcu_read_lock(); - /* - * When the cpuset subsystem is mounted on the legacy hierarchy, - * the top_cpuset.css->cgroup does not hold a reference count of - * cgroup_root.cgroup. This makes accessing css->cgroup very - * dangerous because when the cpuset subsystem is remounted to the - * default hierarchy, the cgroup_root.cgroup that css->cgroup points - * to will be released, leading to a UAF issue. To avoid this problem, - * get the reference count of top_cpuset.css->cgroup first. - * - * This is ugly!! - */ - if (css == &top_cpuset.css) { - root_cgroup = css->cgroup; - if (!css_tryget_online(&root_cgroup->self)) { - rcu_read_unlock(); - retval = -EBUSY; - goto out_free; - } - } - rcu_read_unlock(); retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, current->nsproxy->cgroup_ns); css_put(css); - if (root_cgroup) - css_put(&root_cgroup->self); if (retval >= PATH_MAX) retval = -ENAMETOOLONG; if (retval < 0)
From: Yafang Shao laoar.shao@gmail.com
stable inclusion from stable-v6.6.47 commit dd9542ae7c7ca82ed2d7c185754ba9026361f6bc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit d23b5c577715892c87533b13923306acc6243f93 upstream.
At present, when we perform operations on the cgroup root_list, we must hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, we can make operations on this list RCU-safe, eliminating the need to hold the cgroup_mutex during traversal. Modifications to the list only occur in the cgroup root setup and destroy paths, which should be infrequent in a production environment. In contrast, traversal may occur frequently. Therefore, making it RCU-safe would be beneficial.
Signed-off-by: Yafang Shao laoar.shao@gmail.com Signed-off-by: Tejun Heo tj@kernel.org To: Michal Koutný mkoutny@suse.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Chen Ridong chenridong@huawei.com --- include/linux/cgroup-defs.h | 1 + kernel/cgroup/cgroup-internal.h | 3 ++- kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- 3 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6e3227a688de..05ece896af7d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -591,6 +591,7 @@ struct cgroup_root {
/* A list running through the active hierarchies */ struct list_head root_list; + struct rcu_head rcu;
/* Hierarchy-specific flags */ unsigned int flags; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 96a9bd2c26f0..f5fb12890645 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -170,7 +170,8 @@ extern struct list_head cgroup_roots;
/* iterate across the hierarchies */ #define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) + list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ + lockdep_is_held(&cgroup_mutex))
/** * for_each_subsys - iterate all enabled cgroup subsystems diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 52fe6ba2fefd..c26a9b3a3576 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
void cgroup_free_root(struct cgroup_root *root) { - kfree(root); + kfree_rcu(root, rcu); }
static void cgroup_destroy_root(struct cgroup_root *root) @@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock);
if (!list_empty(&root->root_list)) { - list_del(&root->root_list); + list_del_rcu(&root->root_list); cgroup_root_count--; }
@@ -1388,7 +1388,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } }
- BUG_ON(!res_cgroup); + /* + * If cgroup_mutex is not held, the cgrp_cset_link will be freed + * before we remove the cgroup root from the root_list. Consequently, + * when accessing a cgroup root, the cset_link may have already been + * freed, resulting in a NULL res_cgroup. However, by holding the + * cgroup_mutex, we ensure that res_cgroup can't be NULL. + * If we don't hold cgroup_mutex in the caller, we must do the NULL + * check. + */ return res_cgroup; }
@@ -1447,7 +1455,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock);
return __cset_cgroup_from_root(cset, root); @@ -1455,7 +1462,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
/* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_lock held. + * called with css_set_lock held to prevent task's groups from being modified. + * Must be called with either cgroup_mutex or rcu read lock to prevent the + * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -2030,7 +2039,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp;
- INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); @@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ - list_add(&root->root_list, &cgroup_roots); + list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++;
/*
From: Waiman Long longman@redhat.com
stable inclusion from stable-v6.6.47 commit f3c60ab676bb62e01d004d5b1cf2963a296c8e6a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit a7fb0423c201ba12815877a0b5a68a6a1710b23a upstream.
Commit 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu() for freeing the cgroup_root.
The current implementation of kvfree_rcu(), however, has the limitation that the offset of the rcu_head structure within the larger data structure must be less than 4096 or the compilation will fail. See the macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h for more information.
By putting rcu_head below the large cgroup structure, any change to the cgroup structure that makes it larger run the risk of causing build failure under certain configurations. Commit 77070eeb8821 ("cgroup: Avoid false cacheline sharing of read mostly rstat_cpu") happens to be the last straw that breaks it. Fix this problem by moving the rcu_head structure up before the cgroup structure.
Fixes: 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU safe") Reported-by: Stephen Rothwell sfr@canb.auug.org.au Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/ Signed-off-by: Waiman Long longman@redhat.com Acked-by: Yafang Shao laoar.shao@gmail.com Reviewed-by: Yosry Ahmed yosryahmed@google.com Reviewed-by: Michal Koutný mkoutny@suse.com Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Conflicts: include/linux/cgroup-defs.h [Context is mismatched for wait_queue_head_t wait was merged] Signed-off-by: Chen Ridong chenridong@huawei.com --- include/linux/cgroup-defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 05ece896af7d..8eb518ce87a1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -573,6 +573,10 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id;
+ /* A list running through the active hierarchies */ + struct list_head root_list; + struct rcu_head rcu; /* Must be near the top */ + /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the @@ -589,10 +593,6 @@ struct cgroup_root { /* Wait while cgroups are being destroyed */ wait_queue_head_t wait;
- /* A list running through the active hierarchies */ - struct list_head root_list; - struct rcu_head rcu; - /* Hierarchy-specific flags */ unsigned int flags;
stable inclusion from stable-v6.6.44 commit 96226fbed566f3f686f53a489a29846f2d538080 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IAP55A
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit 1be59c97c83ccd67a519d8a49486b3a8a73ca28a ]
An UAF can happen when /proc/cpuset is read as reported in [1].
This can be reproduced by the following methods: 1.add an mdelay(1000) before acquiring the cgroup_lock In the cgroup_path_ns function. 2.$cat /proc/<pid>/cpuset repeatly. 3.$mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset/ $umount /sys/fs/cgroup/cpuset/ repeatly.
The race that cause this bug can be shown as below:
(umount) | (cat /proc/<pid>/cpuset) css_release | proc_cpuset_show css_release_work_fn | css = task_get_css(tsk, cpuset_cgrp_id); css_free_rwork_fn | cgroup_path_ns(css->cgroup, ...); cgroup_destroy_root | mutex_lock(&cgroup_mutex); rebind_subsystems | cgroup_free_root | | // cgrp was freed, UAF | cgroup_path_ns_locked(cgrp,..);
When the cpuset is initialized, the root node top_cpuset.css.cgrp will point to &cgrp_dfl_root.cgrp. In cgroup v1, the mount operation will allocate cgroup_root, and top_cpuset.css.cgrp will point to the allocated &cgroup_root.cgrp. When the umount operation is executed, top_cpuset.css.cgrp will be rebound to &cgrp_dfl_root.cgrp.
The problem is that when rebinding to cgrp_dfl_root, there are cases where the cgroup_root allocated by setting up the root for cgroup v1 is cached. This could lead to a Use-After-Free (UAF) if it is subsequently freed. The descendant cgroups of cgroup v1 can only be freed after the css is released. However, the css of the root will never be released, yet the cgroup_root should be freed when it is unmounted. This means that obtaining a reference to the css of the root does not guarantee that css.cgrp->root will not be freed.
Fix this problem by using rcu_read_lock in proc_cpuset_show(). As cgroup_root is kfree_rcu after commit 331654dc5f40 ("cgroup: Make operations on the cgroup root_list RCU safe"), css->cgroup won't be freed during the critical section. To call cgroup_path_ns_locked, css_set_lock is needed, so it is safe to replace task_get_css with task_css.
[1] https://syzkaller.appspot.com/bug?extid=9b1ff7be974a403aa4cd
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Signed-off-by: Chen Ridong chenridong@huawei.com Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Conflicts: kernel/cgroup/cpuset.c [commit 5715456af3e0 ("kernfs: Convert kernfs_path_from_node_locked() from strlcpy() to strscpy()") was not merged] Signed-off-by: Chen Ridong chenridong@huawei.com --- kernel/cgroup/cpuset.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 140dfb5ad3fc..f3cf9b1268e0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,6 +21,7 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ +#include "cgroup-internal.h"
#include <linux/cpu.h> #include <linux/cpumask.h> @@ -5191,10 +5192,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out;
- css = task_get_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + if (retval >= PATH_MAX) retval = -ENAMETOOLONG; if (retval < 0)
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8SA3O
--------------------------------
Reserve KABI for future feature development.
Signed-off-by: Chen Ridong chenridong@huawei.com --- include/linux/cgroup-defs.h | 7 +++++++ include/linux/memcontrol.h | 8 ++++++++ kernel/cgroup/cpuset.c | 5 ----- 3 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8eb518ce87a1..f3fd0407d346 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -325,6 +325,8 @@ struct cgroup_base_stat { #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif + KABI_RESERVE(1) + KABI_RESERVE(2) };
/* @@ -555,6 +557,9 @@ struct cgroup { KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5) + KABI_RESERVE(6) + KABI_RESERVE(7) + KABI_RESERVE(8) /* All ancestors including self */ struct cgroup *ancestors[]; }; @@ -606,6 +611,8 @@ struct cgroup_root { KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) + KABI_RESERVE(5) + KABI_RESERVE(6) };
/* diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b2a80e089a0a..abe236201e68 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -429,6 +429,14 @@ struct mem_cgroup { KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) + KABI_RESERVE(9) + KABI_RESERVE(10) + KABI_RESERVE(11) + KABI_RESERVE(12) + KABI_RESERVE(13) + KABI_RESERVE(14) + KABI_RESERVE(15) + KABI_RESERVE(16) struct mem_cgroup_per_node *nodeinfo[]; };
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f3cf9b1268e0..7ea0a6d00519 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -211,11 +211,6 @@ struct cpuset {
/* Remote partition silbling list anchored at remote_children */ struct list_head remote_sibling; - - KABI_RESERVE(1) - KABI_RESERVE(2) - KABI_RESERVE(3) - KABI_RESERVE(4) };
/*
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/14221 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/P...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/14221 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/P...