From: Christian Brauner <brauner@kernel.org> mainline inclusion from mainline-v7.0-rc2 commit a41dbf5e004edbe1260883c43a8bd134d9cb0c1c category: bugfix bugzilla: https://atomgit.com/openeuler/kernel/issues/9218 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Fix an oversight when creating a new mount namespace. If someone had the bright idea to make the real rootfs a shared or dependent mount and it is later copied the copy will become a peer of the old real rootfs mount or a dependent mount of it. The namespace semaphore is dropped and we use mount lock exact to lock the new real root mount. If that fails or the subsequent do_loopback() fails we rely on the copy of the real root mount to be cleaned up by path_put(). The problem is that this doesn't deal with mount propagation and will leave the mounts linked in the propagation lists. When creating a new mount namespace create_new_namespace() first acquires namespace_sem to clone the nullfs root, drops it, then reacquires it via LOCK_MOUNT_EXACT which takes inode_lock first to respect the inode_lock -> namespace_sem lock ordering. This drop-and-reacquire pattern is fragile and was the source of the propagation cleanup bug fixed in the preceding commit. Extend lock_mount_exact() with a copy_mount mode that clones the mount under the locks atomically. When copy_mount is true, path_overmounted() is skipped since we're copying the mount, not mounting on top of it - the nullfs root always has rootfs mounted on top so the check would always fail. If clone_mnt() fails after get_mountpoint() has pinned the mountpoint, __unlock_mount() is used to properly unpin the mountpoint and release both locks. This allows create_new_namespace() to use LOCK_MOUNT_EXACT_COPY which takes inode_lock and namespace_sem once and holds them throughout the clone and subsequent mount operations, eliminating the drop-and-reacquire pattern entirely. Reported-by: syzbot+a89f9434fb5a001ccd58@syzkaller.appspotmail.com Fixes: 9b8a0ba68246 ("mount: add OPEN_TREE_NAMESPACE") # mainline only Link: https://lore.kernel.org/699047f6.050a0220.2757fb.0024.GAE@google.com Signed-off-by: Christian Brauner <brauner@kernel.org> Conflicts: fs/namespace.c [Simple context conflicts, not affect this patch.] Signed-off-by: Zizhi Wo <wozizhi@huawei.com> --- fs/namespace.c | 67 ++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 86b8636f8ef2..a58aa9f07641 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2737,16 +2737,16 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags) static struct mountpoint *lock_mount_exact(struct path *path); static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) { - struct mnt_namespace *new_ns; - struct path to_path = {}; struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct user_namespace *user_ns = current_user_ns(); + struct mnt_namespace *new_ns; + struct mount *new_ns_root, *old_ns_root; + struct path to_path; struct mountpoint *mp; - struct mount *new_ns_root; struct mount *mnt; unsigned int copy_flags = 0; int err; if (user_ns != ns->user_ns) @@ -2754,72 +2754,59 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; - namespace_lock(); - new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); - if (IS_ERR(new_ns_root)) { - namespace_unlock(); - err = PTR_ERR(new_ns_root); - goto err_free_ns; - } - namespace_unlock(); + old_ns_root = ns->root; + to_path.mnt = &old_ns_root->mnt; + to_path.dentry = old_ns_root->mnt.mnt_root; - /* - * We dropped the namespace semaphore so we can actually lock - * the copy for mounting. The copied mount isn't attached to any - * mount namespace and it is thus excluded from any propagation. - * So realistically we're isolated and the mount can't be - * overmounted. - */ - - /* Borrow the reference from clone_mnt(). */ - to_path.mnt = &new_ns_root->mnt; - to_path.dentry = dget(new_ns_root->mnt.mnt_root); - - /* Now lock for actual mounting. */ mp = lock_mount_exact(&to_path); - if (unlikely(IS_ERR(mp))) { + if (IS_ERR(mp)) { err = PTR_ERR(mp); - goto err_path_put; + goto err_free_ns; + } + + new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); + if (IS_ERR(new_ns_root)) { + err = PTR_ERR(new_ns_root); + goto err_unlock_mp; } /* - * We don't emulate unshare()ing a mount namespace. We stick to the - * restrictions of creating detached bind-mounts. It has a lot - * saner and simpler semantics. + * We don't emulate unshare()ing a mount namespace. We stick + * to the restrictions of creating detached bind-mounts. It + * has a lot saner and simpler semantics. */ mnt = __do_loopback(path, flags, copy_flags); + + lock_mount_hash(); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); - unlock_mount(mp); - goto err_path_put; + umount_tree(new_ns_root, 0); + unlock_mount_hash(); + goto err_unlock_mp; } - lock_mount_hash(); /* - * Now mount the detached tree on top of the copy of the - * real rootfs we created. + * now mount the detached tree on top of the copy + * of the real rootfs we created. */ attach_mnt(mnt, new_ns_root, mp, false); if (user_ns != ns->user_ns) lock_mnt_tree(new_ns_root); unlock_mount_hash(); - /* Add all mounts to the new namespace. */ mnt_add_tree_to_ns(new_ns, new_ns_root); new_ns->root = new_ns_root; unlock_mount(mp); - to_path.mnt = NULL; - path_put(&to_path); return new_ns; -err_path_put: - path_put(&to_path); +err_unlock_mp: + unlock_mount(mp); err_free_ns: free_mnt_ns(new_ns); return ERR_PTR(err); } @@ -3531,12 +3518,10 @@ static struct mountpoint *lock_mount_exact(struct path *path) inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) { err = -ENOENT; - } else if (path_overmounted(path)) { - err = -EBUSY; } else { mp = get_mountpoint(dentry); if (IS_ERR(mp)) err = PTR_ERR(mp); } -- 2.52.0