[PATCH OLK-6.6 V3 4/6] mount: add OPEN_TREE_NAMESPACE

28 May 2026

From: Christian Brauner <brauner@kernel.org>

mainline inclusion
from mainline-v7.0-rc1
commit 9b8a0ba68246a61d903ce62c35c303b1501df28b
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/9218

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9b8a0ba68246a61d903ce62c35c303b1501df28b

--------------------------------

When creating containers the setup usually involves using CLONE_NEWNS
via clone3() or unshare(). This copies the caller's complete mount
namespace. The runtime will also assemble a new rootfs and then use
pivot_root() to switch the old mount tree with the new rootfs. Afterward
it will recursively umount the old mount tree thereby getting rid of all
mounts.

On a basic system here where the mount table isn't particularly large
this still copies about 30 mounts. Copying all of these mounts only to
get rid of them later is pretty wasteful.

This is exacerbated if intermediary mount namespaces are used that only
exist for a very short amount of time and are immediately destroyed
again causing a ton of mounts to be copied and destroyed needlessly.

With a large mount table and a system where thousands or tens of
thousands of containers are spawned in parallel this quickly becomes a
bottleneck, increasing contention on the namespace semaphore.

Extend open_tree() with a new OPEN_TREE_NAMESPACE flag. Similar to
OPEN_TREE_CLONE only the indicated mount tree is copied. Instead of
returning a file descriptor referring to that mount tree
OPEN_TREE_NAMESPACE will cause open_tree() to return a file descriptor
to a new mount namespace. In that new mount namespace the copied mount
tree has been mounted on top of a copy of the real rootfs.

The caller can setns() into that mount namespace and perform any
additionally required setup such as move_mount() detached mounts in
there.

This allows OPEN_TREE_NAMESPACE to function as a combined
unshare(CLONE_NEWNS) and pivot_root().

A caller may for example choose to create an extremely minimal rootfs:

fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE);

This will create a mount namespace where "wootwoot" has become the
rootfs mounted on top of the real rootfs. The caller can now setns()
into this new mount namespace and assemble additional mounts.

This also works with user namespaces:

unshare(CLONE_NEWUSER);
fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE);

which creates a new mount namespace owned by the earlier created user
namespace with "wootwoot" as the rootfs mounted on top of the real
rootfs.

Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-1-bfb24c7b061f@kernel.org
Tested-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Aleksa Sarai <cyphar@cyphar.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Suggested-by: Christian Brauner <brauner@kernel.org>
Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>

Conflicts:
	fs/internal.h
	fs/namespace.c
	fs/nsfs.c
[1. This kernel version does not have mnt_add_to_ns(), as commit
2eea9ce4310d ("mounts: keep list of mounts in an rbtree") is not merged.
Implemented mnt_add_tree_to_ns() instead.
 2. This kernel version does not have __ns_tree_add_raw(), as commit
885fc8ac0a4d ("nstree: make iterator generic") is not merged. This does
not affect this patch.
 3. This kernel version does not have path_from_stashed(), as commit
07fd7c329839 ("libfs: add path_from_stashed()") is not merged. Implemented
similar logic in open_namespace_file().
 4. This kernel version does not have LOCK_MOUNT_EXACT, as commit
9bf5d488529b ("finish_automount(): take the lock_mount() analogue into a
helper") is not merged. Implemented lock_mount_exact() instead.
 5. There are a few other minor conflicts that do not affect the patch.]
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
---
 fs/internal.h              |   2 +
 fs/namespace.c             | 204 ++++++++++++++++++++++++++++++++++---
 fs/nsfs.c                  |  32 ++++++
 include/uapi/linux/mount.h |   3 +-
 4 files changed, 228 insertions(+), 13 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index 273e6fd40d1b..68f51bf7c5b0 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -15,10 +15,11 @@ struct mount;
 struct shrink_control;
 struct fs_context;
 struct pipe_inode_info;
 struct iov_iter;
 struct mnt_idmap;
+struct ns_common;
 
 /*
  * block/bdev.c
  */
 #ifdef CONFIG_BLOCK
@@ -228,10 +229,11 @@ extern void mnt_pin_kill(struct mount *m);
 
 /*
  * fs/nsfs.c
  */
 extern const struct dentry_operations ns_dentry_operations;
+struct file *open_namespace_file(struct ns_common *ns);
 
 /*
  * fs/stat.c:
  */
 
diff --git a/fs/namespace.c b/fs/namespace.c
index fbc6dd74ded4..2c9ed65371f1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1029,10 +1029,21 @@ static struct mount *skip_mnt_tree(struct mount *p)
 		prev = p->mnt_mounts.prev;
 	}
 	return p;
 }
 
+/* Give every mount under @root to @ns and link each onto ns->list, since the
+ * tree attached via attach_mnt() is not on root's mnt_list (NOTE: confirm). */
+static void mnt_add_tree_to_ns(struct mnt_namespace *ns, struct mount *root)
+{
+	for (struct mount *mnt = root; mnt; mnt = next_mnt(mnt, root)) {
+		mnt->mnt_ns = ns;
+		ns->mounts++;
+		list_move_tail(&mnt->mnt_list, &ns->list);
+	}
+}
+
 /**
  * vfs_create_mount - Create a mount for a configured superblock
  * @fc: The configuration context with the superblock attached
  *
  * Create a mount to an already configured superblock.  If necessary, the
@@ -2569,27 +2580,41 @@ static int do_change_type(struct path *path, int ms_flags)
  out_unlock:
 	namespace_unlock();
 	return err;
 }
 
-static struct mount *__do_loopback(struct path *old_path, int recurse)
+static struct mount *__do_loopback(struct path *old_path,
+				   unsigned int flags, unsigned int copy_flags)
 {
 	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+	bool recurse = flags & AT_RECURSIVE;
 
 	if (IS_MNT_UNBINDABLE(old))
 		return mnt;
 
 	if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
 		return mnt;
 
 	if (!recurse && has_locked_children(old, old_path->dentry))
 		return mnt;
 
+	/*
+	 * When creating a new mount namespace we don't want to copy over
+	 * mounts of mount namespaces to avoid the risk of cycles and also to
+	 * minimize the default complex interdependencies between mount
+	 * namespaces.
+	 *
+	 * We could ofc just check whether all mount namespace files aren't
+	 * creating cycles but really let's keep this simple.
+	 */
+	if (!(flags & OPEN_TREE_NAMESPACE))
+		copy_flags |= CL_COPY_MNT_NS_FILE;
+
 	if (recurse)
-		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+		mnt = copy_tree(old, old_path->dentry, copy_flags);
 	else
-		mnt = clone_mnt(old, old_path->dentry, 0);
+		mnt = clone_mnt(old, old_path->dentry, copy_flags);
 
 	if (!IS_ERR(mnt))
 		mnt->mnt.mnt_flags &= ~MNT_LOCKED;
 
 	return mnt;
@@ -2602,11 +2627,13 @@ static int do_loopback(struct path *path, const char *old_name,
 				int recurse)
 {
 	struct path old_path;
 	struct mount *mnt = NULL, *parent;
 	struct mountpoint *mp;
+	unsigned int flags = recurse ? AT_RECURSIVE : 0;
 	int err;
+
 	if (!old_name || !*old_name)
 		return -EINVAL;
 	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
 	if (err)
 		return err;
@@ -2623,11 +2650,11 @@ static int do_loopback(struct path *path, const char *old_name,
 
 	parent = real_mount(path->mnt);
 	if (!check_mnt(parent))
 		goto out2;
 
-	mnt = __do_loopback(&old_path, recurse);
+	mnt = __do_loopback(&old_path, flags, 0);
 	if (IS_ERR(mnt)) {
 		err = PTR_ERR(mnt);
 		goto out2;
 	}
 
@@ -2642,22 +2669,22 @@ static int do_loopback(struct path *path, const char *old_name,
 out:
 	path_put(&old_path);
 	return err;
 }
 
-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct file *open_detached_copy(struct path *path, unsigned int flags)
 {
 	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
 	struct mount *mnt, *p;
 	struct file *file;
 
 	if (IS_ERR(ns))
 		return ERR_CAST(ns);
 
 	namespace_lock();
-	mnt = __do_loopback(path, recursive);
+	mnt = __do_loopback(path, flags, 0);
 	if (IS_ERR(mnt)) {
 		namespace_unlock();
 		free_mnt_ns(ns);
 		return ERR_CAST(mnt);
 	}
@@ -2681,49 +2708,177 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
 	else
 		file->f_mode |= FMODE_NEED_UNMOUNT;
 	return file;
 }
 
+static struct mountpoint *lock_mount_exact(struct path *path);
+
+static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
+{
+	struct mnt_namespace *new_ns;
+	struct path to_path = {};
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct user_namespace *user_ns = current_user_ns();
+	struct mountpoint *mp;
+	struct mount *new_ns_root;
+	struct mount *mnt;
+	unsigned int copy_flags = 0;
+	bool locked = false;
+	int err;
+
+	if (user_ns != ns->user_ns)
+		copy_flags |= CL_SLAVE;
+
+	new_ns = alloc_mnt_ns(user_ns, false);
+	if (IS_ERR(new_ns))
+		return new_ns;
+
+	namespace_lock();
+	new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
+	if (IS_ERR(new_ns_root)) {
+		namespace_unlock();
+		err = PTR_ERR(new_ns_root);
+		goto err_free_ns;
+	}
+
+	/*
+	 * If the real rootfs had a locked mount on top of it somewhere
+	 * in the stack, lock the new mount tree as well so it can't be
+	 * exposed.
+	 */
+	mnt = ns->root;
+	while (mnt->overmount) {
+		mnt = mnt->overmount;
+		if (mnt->mnt.mnt_flags & MNT_LOCKED)
+			locked = true;
+	}
+	namespace_unlock();
+
+	/*
+	 * We dropped the namespace semaphore so we can actually lock
+	 * the copy for mounting. The copied mount isn't attached to any
+	 * mount namespace and it is thus excluded from any propagation.
+	 * So realistically we're isolated and the mount can't be
+	 * overmounted.
+	 */
+
+	/* Borrow the reference from clone_mnt(). */
+	to_path.mnt = &new_ns_root->mnt;
+	to_path.dentry = dget(new_ns_root->mnt.mnt_root);
+
+	/* Now lock for actual mounting. */
+	mp = lock_mount_exact(&to_path);
+	if (unlikely(IS_ERR(mp))) {
+		err = PTR_ERR(mp);
+		goto err_path_put;
+	}
+
+	/*
+	 * We don't emulate unshare()ing a mount namespace. We stick to the
+	 * restrictions of creating detached bind-mounts. It has a lot
+	 * saner and simpler semantics.
+	 */
+	mnt = __do_loopback(path, flags, copy_flags);
+	if (IS_ERR(mnt)) {
+		err = PTR_ERR(mnt);
+		unlock_mount(mp);
+		goto err_path_put;
+	}
+
+	lock_mount_hash();
+	if (locked)
+		mnt->mnt.mnt_flags |= MNT_LOCKED;
+	/*
+	 * Now mount the detached tree on top of the copy of the
+	 * real rootfs we created.
+	 */
+	attach_mnt(mnt, new_ns_root, mp);
+	if (user_ns != ns->user_ns)
+		lock_mnt_tree(new_ns_root);
+	unlock_mount_hash();
+
+	/* Add all mounts to the new namespace. */
+	mnt_add_tree_to_ns(new_ns, new_ns_root);
+
+	new_ns->root = new_ns_root;
+	unlock_mount(mp);
+	to_path.mnt = NULL;
+	path_put(&to_path);
+
+	return new_ns;
+
+err_path_put:
+	path_put(&to_path);
+err_free_ns:
+	free_mnt_ns(new_ns);
+	return ERR_PTR(err);
+}
+
+static struct file *open_new_namespace(struct path *path, unsigned int flags)
+{
+	struct mnt_namespace *new_ns;
+
+	new_ns = create_new_namespace(path, flags);
+	if (IS_ERR(new_ns))
+		return ERR_CAST(new_ns);
+
+	return open_namespace_file(from_mnt_ns(new_ns));
+}
+
 SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
 {
 	struct file *file;
 	struct path path;
 	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-	bool detached = flags & OPEN_TREE_CLONE;
 	int error;
 	int fd;
 
 	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
 
 	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
 		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
-		      OPEN_TREE_CLOEXEC))
+		      OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
+		return -EINVAL;
+
+	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
+	    AT_RECURSIVE)
 		return -EINVAL;
 
-	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+	if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
 		return -EINVAL;
 
 	if (flags & AT_NO_AUTOMOUNT)
 		lookup_flags &= ~LOOKUP_AUTOMOUNT;
 	if (flags & AT_SYMLINK_NOFOLLOW)
 		lookup_flags &= ~LOOKUP_FOLLOW;
 	if (flags & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;
 
-	if (detached && !may_mount())
+	/*
+	 * If we create a new mount namespace with the cloned mount tree we
+	 * just care about being privileged over our current user namespace.
+	 * The new mount namespace will be owned by it.
+	 */
+	if ((flags & OPEN_TREE_NAMESPACE) &&
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((flags & OPEN_TREE_CLONE) && !may_mount())
 		return -EPERM;
 
 	fd = get_unused_fd_flags(flags & O_CLOEXEC);
 	if (fd < 0)
 		return fd;
 
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (unlikely(error)) {
 		file = ERR_PTR(error);
 	} else {
-		if (detached)
-			file = open_detached_copy(&path, flags & AT_RECURSIVE);
+		if (flags & OPEN_TREE_NAMESPACE)
+			file = open_new_namespace(&path, flags);
+		else if (flags & OPEN_TREE_CLONE)
+			file = open_detached_copy(&path, flags);
 		else
 			file = dentry_open(&path, O_PATH, current_cred());
 		path_put(&path);
 	}
 	if (IS_ERR(file)) {
@@ -3356,10 +3511,35 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 
 	put_fs_context(fc);
 	return err;
 }
 
+static struct mountpoint *lock_mount_exact(struct path *path)
+{
+	struct dentry *dentry = path->dentry;
+	struct mountpoint *mp;
+	int err = 0;
+
+	inode_lock(dentry->d_inode);
+	namespace_lock();
+	if (unlikely(cant_mount(dentry))) {
+		err = -ENOENT;
+	} else if (path_overmounted(path)) {
+		err = -EBUSY;
+	} else {
+		mp = get_mountpoint(dentry);
+		if (IS_ERR(mp))
+			err = PTR_ERR(mp);
+	}
+	if (unlikely(err)) {
+		namespace_unlock();
+		inode_unlock(dentry->d_inode);
+		return ERR_PTR(err);
+	}
+	return mp;
+}
+
 int finish_automount(struct vfsmount *m, const struct path *path)
 {
 	struct dentry *dentry = path->dentry;
 	struct mountpoint *mp;
 	struct mount *mnt;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 647a22433bd8..8f8c3c7c37da 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -143,10 +143,42 @@ int ns_get_path(struct path *path, struct task_struct *task,
 	};
 
 	return ns_get_path_cb(path, ns_get_path_task, &args);
 }
 
+static struct ns_common *ns_get_from_common(void *private_data)
+{
+	struct ns_common *ns = private_data;
+
+	refcount_inc(&ns->count);
+	return ns;
+}
+
+/**
+ * open_namespace_file - open a file for an existing namespace
+ * @ns: namespace to open
+ *
+ * The caller must pass a live namespace reference. This helper consumes that
+ * reference independent of success or failure. Temporary references are
+ * acquired through ns_get_path_cb() so stashed nsfs dentry lookup can retry.
+ */
+struct file *open_namespace_file(struct ns_common *ns)
+{
+	struct path path = {};
+	struct file *file;
+	int err;
+
+	err = ns_get_path_cb(&path, ns_get_from_common, ns);
+	ns->ops->put(ns);
+	if (err)
+		return ERR_PTR(err);
+
+	file = dentry_open(&path, O_RDONLY, current_cred());
+	path_put(&path);
+	return file;
+}
+
 int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns))
 {
 	struct path path = {};
 	struct file *f;
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index bb242fdcfe6b..9e1fbb17d305 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -59,11 +59,12 @@
 #define MS_MGC_MSK 0xffff0000
 
 /*
  * open_tree() flags.
  */
-#define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLONE		(1 << 0)	/* Clone the target tree and attach the clone */
+#define OPEN_TREE_NAMESPACE	(1 << 1)	/* Clone the target tree into a new mount namespace */
 #define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */
 
 /*
  * move_mount() flags.
  */
-- 
2.52.0