[PATCH OLK-6.6 V4 2/2] mount: hold namespace_sem across copy in create_new_namespace()

28 May 2026

From: Christian Brauner <brauner@kernel.org>

mainline inclusion
from mainline-v7.0-rc2
commit a41dbf5e004edbe1260883c43a8bd134d9cb0c1c
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/9218

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a41dbf5e004edbe1260883c43a8bd134d9cb0c1c

--------------------------------

Fix an oversight when creating a new mount namespace. If someone had the
bright idea to make the real rootfs a shared or dependent mount and it
is later copied, the copy will become a peer of the old real rootfs
mount or a dependent mount of it. The namespace semaphore is dropped and
we use lock_mount_exact() to lock the new real root mount. If that fails
or the subsequent __do_loopback() fails, we rely on the copy of the real
root mount to be cleaned up by path_put(). The problem is that this
doesn't deal with mount propagation and will leave the mounts linked in
the propagation lists.

When creating a new mount namespace, create_new_namespace() first
acquires namespace_sem to clone the nullfs root, drops it, then
reacquires it via LOCK_MOUNT_EXACT, which takes inode_lock first to
respect the inode_lock -> namespace_sem lock ordering. This
drop-and-reacquire pattern is fragile and was the source of the
propagation cleanup bug fixed in the preceding commit.

Extend lock_mount_exact() with a copy_mount mode that clones the mount
under the locks atomically. When copy_mount is true, path_overmounted()
is skipped since we're copying the mount, not mounting on top of it -
the nullfs root always has rootfs mounted on top so the check would
always fail. If clone_mnt() fails after get_mountpoint() has pinned the
mountpoint, __unlock_mount() is used to properly unpin the mountpoint
and release both locks.

This allows create_new_namespace() to use LOCK_MOUNT_EXACT_COPY which
takes inode_lock and namespace_sem once and holds them throughout the
clone and subsequent mount operations, eliminating the
drop-and-reacquire pattern entirely.

Reported-by: syzbot+a89f9434fb5a001ccd58@syzkaller.appspotmail.com
Fixes: 9b8a0ba68246 ("mount: add OPEN_TREE_NAMESPACE") # mainline only
Link: https://lore.kernel.org/699047f6.050a0220.2757fb.0024.GAE@google.com
Signed-off-by: Christian Brauner <brauner@kernel.org>

Conflicts:
	fs/namespace.c
[Simple context conflicts that do not affect this patch.]
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
---
 fs/namespace.c | 67 ++++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 41 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 86b8636f8ef2..a58aa9f07641 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2737,16 +2737,16 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
 
 static struct mountpoint *lock_mount_exact(struct path *path);
 
 static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
 {
-	struct mnt_namespace *new_ns;
-	struct path to_path = {};
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct user_namespace *user_ns = current_user_ns();
+	struct mnt_namespace *new_ns;
+	struct mount *new_ns_root, *old_ns_root;
+	struct path to_path;
 	struct mountpoint *mp;
-	struct mount *new_ns_root;
 	struct mount *mnt;
 	unsigned int copy_flags = 0;
 	int err;
 
 	if (user_ns != ns->user_ns)
@@ -2754,72 +2754,59 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
 
 	new_ns = alloc_mnt_ns(user_ns, false);
 	if (IS_ERR(new_ns))
 		return new_ns;
 
-	namespace_lock();
-	new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
-	if (IS_ERR(new_ns_root)) {
-		namespace_unlock();
-		err = PTR_ERR(new_ns_root);
-		goto err_free_ns;
-	}
-	namespace_unlock();
+	old_ns_root = ns->root;
+	to_path.mnt = &old_ns_root->mnt;
+	to_path.dentry = old_ns_root->mnt.mnt_root;
 
-	/*
-	 * We dropped the namespace semaphore so we can actually lock
-	 * the copy for mounting. The copied mount isn't attached to any
-	 * mount namespace and it is thus excluded from any propagation.
-	 * So realistically we're isolated and the mount can't be
-	 * overmounted.
-	 */
-
-	/* Borrow the reference from clone_mnt(). */
-	to_path.mnt = &new_ns_root->mnt;
-	to_path.dentry = dget(new_ns_root->mnt.mnt_root);
-
-	/* Now lock for actual mounting. */
 	mp = lock_mount_exact(&to_path);
-	if (unlikely(IS_ERR(mp))) {
+	if (IS_ERR(mp)) {
 		err = PTR_ERR(mp);
-		goto err_path_put;
+		goto err_free_ns;
+	}
+
+	new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
+	if (IS_ERR(new_ns_root)) {
+		err = PTR_ERR(new_ns_root);
+		goto err_unlock_mp;
 	}
 
 	/*
-	 * We don't emulate unshare()ing a mount namespace. We stick to the
-	 * restrictions of creating detached bind-mounts. It has a lot
-	 * saner and simpler semantics.
+	 * We don't emulate unshare()ing a mount namespace. We stick
+	 * to the restrictions of creating detached bind-mounts. It
+	 * has a lot saner and simpler semantics.
 	 */
 	mnt = __do_loopback(path, flags, copy_flags);
+
+	lock_mount_hash();
 	if (IS_ERR(mnt)) {
 		err = PTR_ERR(mnt);
-		unlock_mount(mp);
-		goto err_path_put;
+		umount_tree(new_ns_root, 0);
+		unlock_mount_hash();
+		goto err_unlock_mp;
 	}
 
-	lock_mount_hash();
 	/*
-	 * Now mount the detached tree on top of the copy of the
-	 * real rootfs we created.
+	 * Now mount the detached tree on top of the copy
+	 * of the real rootfs we created.
 	 */
 	attach_mnt(mnt, new_ns_root, mp, false);
 	if (user_ns != ns->user_ns)
 		lock_mnt_tree(new_ns_root);
 	unlock_mount_hash();
 
-	/* Add all mounts to the new namespace. */
 	mnt_add_tree_to_ns(new_ns, new_ns_root);
 
 	new_ns->root = new_ns_root;
 	unlock_mount(mp);
-	to_path.mnt = NULL;
-	path_put(&to_path);
 
 	return new_ns;
 
-err_path_put:
-	path_put(&to_path);
+err_unlock_mp:
+	unlock_mount(mp);
 err_free_ns:
 	free_mnt_ns(new_ns);
 	return ERR_PTR(err);
 }
 
@@ -3531,12 +3518,10 @@ static struct mountpoint *lock_mount_exact(struct path *path)
 
 	inode_lock(dentry->d_inode);
 	namespace_lock();
 	if (unlikely(cant_mount(dentry))) {
 		err = -ENOENT;
-	} else if (path_overmounted(path)) {
-		err = -EBUSY;
 	} else {
 		mp = get_mountpoint(dentry);
 		if (IS_ERR(mp))
 			err = PTR_ERR(mp);
 	}
-- 
2.52.0