From: Christian Brauner <brauner@kernel.org> mainline inclusion from mainline-v7.0-rc2 commit a41dbf5e004edbe1260883c43a8bd134d9cb0c1c category: bugfix bugzilla: https://atomgit.com/openeuler/kernel/issues/9218 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Fix an oversight when creating a new mount namespace. If someone had the bright idea to make the real rootfs a shared or dependent mount and it is later copied the copy will become a peer of the old real rootfs mount or a dependent mount of it. The namespace semaphore is dropped and we use mount lock exact to lock the new real root mount. If that fails or the subsequent do_loopback() fails we rely on the copy of the real root mount to be cleaned up by path_put(). The problem is that this doesn't deal with mount propagation and will leave the mounts linked in the propagation lists. When creating a new mount namespace create_new_namespace() first acquires namespace_sem to clone the nullfs root, drops it, then reacquires it via LOCK_MOUNT_EXACT which takes inode_lock first to respect the inode_lock -> namespace_sem lock ordering. This drop-and-reacquire pattern is fragile and was the source of the propagation cleanup bug fixed in the preceding commit. Extend lock_mount_exact() with a copy_mount mode that clones the mount under the locks atomically. When copy_mount is true, path_overmounted() is skipped since we're copying the mount, not mounting on top of it - the nullfs root always has rootfs mounted on top so the check would always fail. If clone_mnt() fails after get_mountpoint() has pinned the mountpoint, __unlock_mount() is used to properly unpin the mountpoint and release both locks. This allows create_new_namespace() to use LOCK_MOUNT_EXACT_COPY which takes inode_lock and namespace_sem once and holds them throughout the clone and subsequent mount operations, eliminating the drop-and-reacquire pattern entirely. Reported-by: syzbot+a89f9434fb5a001ccd58@syzkaller.appspotmail.com Fixes: 9b8a0ba68246 ("mount: add OPEN_TREE_NAMESPACE") # mainline only Link: https://lore.kernel.org/699047f6.050a0220.2757fb.0024.GAE@google.com Signed-off-by: Christian Brauner <brauner@kernel.org> Conflicts: fs/namespace.c [Simple context conflicts, not affect this patch.] Signed-off-by: Zizhi Wo <wozizhi@huawei.com> --- fs/namespace.c | 70 ++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 937390f0882c..5e69a4a74c6d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2712,16 +2712,16 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags) static struct mountpoint *lock_mount_exact(struct path *path); static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) { - struct mnt_namespace *new_ns; - struct path to_path = {}; struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct user_namespace *user_ns = current_user_ns(); + struct mnt_namespace *new_ns; + struct mount *new_ns_root, *old_ns_root; + struct path to_path; struct mountpoint *mp; - struct mount *new_ns_root; struct mount *mnt; unsigned int copy_flags = 0; bool locked = false; int err; @@ -2730,68 +2730,58 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; - namespace_lock(); + old_ns_root = ns->root; + to_path.mnt = &old_ns_root->mnt; + to_path.dentry = old_ns_root->mnt.mnt_root; + + mp = lock_mount_exact(&to_path); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + goto err_free_ns; + } + new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); if (IS_ERR(new_ns_root)) { - namespace_unlock(); err = PTR_ERR(new_ns_root); - goto out_free_ns; + goto err_unlock_mp; } /* * If the real rootfs had a locked mount on top of it somewhere * in the stack, lock the new mount tree as well so it can't be * exposed. */ - mnt = ns->root; + mnt = old_ns_root; while (mnt->overmount) { mnt = mnt->overmount; if (mnt->mnt.mnt_flags & MNT_LOCKED) locked = true; } - namespace_unlock(); /* - * We dropped the namespace semaphore so we can actually lock - * the copy for mounting. The copied mount isn't attached to any - * mount namespace and it is thus excluded from any propagation. - * So realistically we're isolated and the mount can't be - * overmounted. - */ - - /* Borrow the reference from clone_mnt(). */ - to_path.mnt = &new_ns_root->mnt; - to_path.dentry = dget(new_ns_root->mnt.mnt_root); - - /* Now lock for actual mounting. */ - mp = lock_mount_exact(&to_path); - if (unlikely(IS_ERR(mp))) { - err = PTR_ERR(mp); - goto out_path_put; - } - - /* - * We don't emulate unshare()ing a mount namespace. We stick to the - * restrictions of creating detached bind-mounts. It has a lot - * saner and simpler semantics. + * We don't emulate unshare()ing a mount namespace. We stick + * to the restrictions of creating detached bind-mounts. It + * has a lot saner and simpler semantics. */ mnt = __do_loopback(path, flags, copy_flags); + + lock_mount_hash(); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); - unlock_mount(mp); - goto out_path_put; + umount_tree(new_ns_root, 0); + unlock_mount_hash(); + goto err_unlock_mp; } - lock_mount_hash(); if (locked) mnt->mnt.mnt_flags |= MNT_LOCKED; /* - * Now mount the detached tree on top of the copy of the - * real rootfs we created. + * now mount the detached tree on top of the copy + * of the real rootfs we created. */ attach_mnt(mnt, new_ns_root, mp); if (user_ns != ns->user_ns) lock_mnt_tree(new_ns_root); @@ -2799,18 +2789,16 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in mnt_add_tree_to_ns(new_ns, new_ns_root); new_ns->root = new_ns_root; unlock_mount_hash(); unlock_mount(mp); - to_path.mnt = NULL; - path_put(&to_path); return new_ns; -out_path_put: - path_put(&to_path); -out_free_ns: +err_unlock_mp: + unlock_mount(mp); +err_free_ns: free_mnt_ns(new_ns); return ERR_PTR(err); } static struct file *open_new_namespace(struct path *path, unsigned int flags) @@ -3521,12 +3509,10 @@ static struct mountpoint *lock_mount_exact(struct path *path) inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) { err = -ENOENT; - } else if (path_overmounted(path)) { - err = -EBUSY; } else { mp = get_mountpoint(dentry); if (IS_ERR(mp)) err = PTR_ERR(mp); } -- 2.52.0