[PATCH 1/2] cgroup: saner refcounting for cgroup_root

17 Apr 2020

From: Al Viro viro@zeniv.linux.org.uk
mainline inclusion
from mainline-5.1-rc1
commit 35ac1184244f1329783e1d897f74926d8bb1103a
category: bugfix
bugzilla: 30217
CVE: NA
---------------------------
* make the reference from superblock to cgroup_root counting -
do cgroup_put() in cgroup_kill_sb() whether we'd done
percpu_ref_kill() or not; matching grab is done when we allocate
a new root.  That gives the same refcounting rules for all callers
of cgroup_do_mount() - a reference to cgroup_root has been grabbed
by caller and it either is transferred to new superblock or dropped.
* have cgroup_kill_sb() treat an already killed refcount as "just
don't bother killing it, then".
* after successful cgroup_do_mount() have cgroup1_mount() recheck
if we'd raced with mount/umount from somebody else and cgroup_root
got killed.  In that case we drop the superblock and bugger off
with -ERESTARTSYS, same as if we'd found it in the list already
dying.
* don't bother with delayed initialization of refcount - it's
unreliable and not needed.  No need to prevent attempts to bump
the refcount if we find cgroup_root of another mount in progress -
sget will reuse an existing superblock just fine and if the
other sb manages to die before we get there, we'll catch
that immediately after cgroup_do_mount().
* don't bother with kernfs_pin_sb() - no need for doing that
either.
Signed-off-by: Al Viro viro@zeniv.linux.org.uk
Signed-off-by: yangerkun yangerkun@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 kernel/cgroup/cgroup-internal.h |  2 +-
 kernel/cgroup/cgroup-v1.c       | 58 +++++++++--------------------------------
 kernel/cgroup/cgroup.c          | 16 +++++-------
 3 files changed, 21 insertions(+), 55 deletions(-)

diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 75568fc..6f02be1 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -196,7 +196,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_free_root(struct cgroup_root *root);
 void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
 int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
 struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
    		       struct cgroup_root *root, unsigned long magic,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index ff3e1aa..e66bb45 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1112,13 +1112,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
    		     void *data, unsigned long magic,
    		     struct cgroup_namespace *ns)
 {
-	struct super_block *pinned_sb = NULL;
    struct cgroup_sb_opts opts;
    struct cgroup_root *root;
    struct cgroup_subsys *ss;
    struct dentry *dentry;
    int i, ret;
-	bool new_root = false;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1180,29 +1178,6 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
    	if (root->flags ^ opts.flags)
    		pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-		/*
-		 * We want to reuse @root whose lifetime is governed by its
-		 * ->cgrp.  Let's check whether @root is alive and keep it
-		 * that way.  As cgroup_kill_sb() can happen anytime, we
-		 * want to block it by pinning the sb so that @root doesn't
-		 * get killed before mount is complete.
-		 *
-		 * With the sb pinned, tryget_live can reliably indicate
-		 * whether @root can be reused.  If it's being killed,
-		 * drain it.  We can use wait_queue for the wait but this
-		 * path is super cold.  Let's just sleep a bit and retry.
-		 */
-		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
-		if (IS_ERR(pinned_sb) ||
-		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
-			mutex_unlock(&cgroup_mutex);
-			if (!IS_ERR_OR_NULL(pinned_sb))
-				deactivate_super(pinned_sb);
-			msleep(10);
-			ret = restart_syscall();
-			goto out_free;
-		}
-
    	ret = 0;
    	goto out_unlock;
    }
@@ -1228,15 +1203,20 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
    	ret = -ENOMEM;
    	goto out_unlock;
    }
-	new_root = true;
init_cgroup_root(root, &opts);
-	ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
+	ret = cgroup_setup_root(root, opts.subsys_mask);
    if (ret)
    	cgroup_free_root(root);
out_unlock:
+	if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+		mutex_unlock(&cgroup_mutex);
+		msleep(10);
+		ret = restart_syscall();
+		goto out_free;
+	}
    mutex_unlock(&cgroup_mutex);
 out_free:
    kfree(opts.release_agent);
@@ -1248,25 +1228,13 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
    dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
    			 CGROUP_SUPER_MAGIC, ns);
-	/*
-	 * There's a race window after we release cgroup_mutex and before
-	 * allocating a superblock. Make sure a concurrent process won't
-	 * be able to re-use the root during this window by delaying the
-	 * initialization of root refcnt.
-	 */
-	if (new_root) {
-		mutex_lock(&cgroup_mutex);
-		percpu_ref_reinit(&root->cgrp.self.refcnt);
-		mutex_unlock(&cgroup_mutex);
+	if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+		struct super_block *sb = dentry->d_sb;
+		dput(dentry);
+		deactivate_locked_super(sb);
+		msleep(10);
+		dentry = ERR_PTR(restart_syscall());
    }
-
-	/*
-	 * If @pinned_sb, we're reusing an existing root and holding an
-	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
-	 */
-	if (pinned_sb)
-		deactivate_super(pinned_sb);
-
    return dentry;
 }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 08bd40d..eaee21d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1897,7 +1897,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
    	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 {
    LIST_HEAD(tmp_links);
    struct cgroup *root_cgrp = &root->cgrp;
@@ -1914,7 +1914,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
    root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
-			      ref_flags, GFP_KERNEL);
+			      0, GFP_KERNEL);
    if (ret)
    	goto out;
@@ -2091,18 +2091,16 @@ static void cgroup_kill_sb(struct super_block *sb)
    struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
-	 * If @root doesn't have any mounts or children, start killing it.
+	 * If @root doesn't have any children, start killing it.
     * This prevents new mounts by disabling percpu_ref_tryget_live().
     * cgroup_mount() may wait for @root's release.
     *
     * And don't kill the default root.
     */
-	if (!list_empty(&root->cgrp.self.children) ||
-	    root == &cgrp_dfl_root)
-		cgroup_put(&root->cgrp);
-	else
+	if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
+	    !percpu_ref_is_dying(&root->cgrp.self.refcnt))
    	percpu_ref_kill(&root->cgrp.self.refcnt);
-
+	cgroup_put(&root->cgrp);
    kernfs_kill_sb(sb);
 }
@@ -5371,7 +5369,7 @@ int __init cgroup_init(void)
    hash_add(css_set_table, &init_css_set.hlist,
    	 css_set_hash(init_css_set.subsys));
-	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
+	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
-- 
1.8.3

    

2024

2023

2022

2021

2020

2019

[PATCH 1/2] cgroup: saner refcounting for cgroup_root