V4: Fix wrong adaptation in ceph_mdsc_sync().
V3: Fix wrong adaptation in remove_session_caps_cb().
V2: Modify conflicting files in the commit message.
V1: Fix main error.
Jeff Layton (4): ceph: drop private list from remove_session_caps_cb ceph: fix auth cap handling logic in remove_session_caps_cb ceph: refactor remove_session_caps_cb ceph: shut down access to inode when async create fails
Xiubo Li (1): ceph: blocklist the kclient when receiving corrupted snap trace
fs/ceph/addr.c | 24 ++++-- fs/ceph/caps.c | 138 +++++++++++++++++++++++++++++++-- fs/ceph/export.c | 12 ++- fs/ceph/file.c | 13 +++- fs/ceph/inode.c | 33 +++++++- fs/ceph/locks.c | 7 ++ fs/ceph/mds_client.c | 144 ++++++++--------------------------- fs/ceph/snap.c | 36 ++++++++- fs/ceph/super.h | 12 +++ include/linux/ceph/libceph.h | 1 + 10 files changed, 290 insertions(+), 130 deletions(-)
From: Jeff Layton jlayton@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit c35cac610a24f8b2e2d6f6535b7300d3bb2e5c29 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9R4KH CVE: CVE-2023-52732
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c35cac610a24f8b2e2d6f6535b7300d3bb2e5c29
--------------------------------
This function does a lot of list-shuffling with cap flushes, all to avoid possibly freeing a slab allocation under spinlock (which is totally ok). Simplify the code by just detaching and freeing the cap flushes in place.
Signed-off-by: Jeff Layton jlayton@kernel.org Signed-off-by: Ilya Dryomov idryomov@gmail.com Signed-off-by: Zizhi Wo wozizhi@huawei.com --- fs/ceph/mds_client.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index df1ecb8bfebf..c7c6cde3faaa 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1640,7 +1640,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - LIST_HEAD(to_remove); bool dirty_dropped = false; bool invalidate = false; int capsnap_release = 0; @@ -1659,16 +1658,17 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, mapping_set_error(&inode->i_data, -EIO); }
+ spin_lock(&mdsc->cap_dirty_lock); + + /* trash all of the cap flushes for this inode */ while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); - list_move(&cf->i_list, &to_remove); - } - - spin_lock(&mdsc->cap_dirty_lock); - - list_for_each_entry(cf, &to_remove, i_list) list_del_init(&cf->g_list); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + }
if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited( @@ -1711,22 +1711,16 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, }
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { - list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); + cf = ci->i_prealloc_cap_flush; ci->i_prealloc_cap_flush = NULL; + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); }
if (!list_empty(&ci->i_cap_snaps)) capsnap_release = remove_capsnaps(mdsc, inode); } spin_unlock(&ci->i_ceph_lock); - while (!list_empty(&to_remove)) { - struct ceph_cap_flush *cf; - cf = list_first_entry(&to_remove, - struct ceph_cap_flush, i_list); - list_del_init(&cf->i_list); - if (!cf->is_capsnap) - ceph_free_cap_flush(cf); - }
wake_up_all(&ci->i_cap_wq); if (invalidate)
From: Jeff Layton jlayton@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 3c3050267e3c9a230f23a5621d7c6bd084d15094 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9R4KH CVE: CVE-2023-52732
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3c3050267e3c9a230f23a5621d7c6bd084d15094
--------------------------------
The existing logic relies on ci->i_auth_cap being NULL, but if we end up removing the auth cap early, then we'll do a lot of useless work and lock-taking on the remaining caps. Ensure that we only do the auth cap removal when we're _actually_ removing the auth cap.
Signed-off-by: Jeff Layton jlayton@kernel.org Signed-off-by: Ilya Dryomov idryomov@gmail.com
Conflicts: fs/ceph/mds_client.c [Adaptation context] Signed-off-by: Zizhi Wo wozizhi@huawei.com --- fs/ceph/mds_client.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c7c6cde3faaa..d5f86f4e90d2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1640,6 +1640,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); + bool is_auth; bool dirty_dropped = false; bool invalidate = false; int capsnap_release = 0; @@ -1647,8 +1648,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); + is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); - if (!ci->i_auth_cap) { + if (is_auth) { struct ceph_cap_flush *cf;
if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
From: Jeff Layton jlayton@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 36e6da987e7ea839c671c950da5d3a6d175b3f0d category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9R4KH CVE: CVE-2023-52732
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=36e6da987e7ea839c671c950da5d3a6d175b3f0d
--------------------------------
Move remove_capsnaps to caps.c. Move the part of remove_session_caps_cb under i_ceph_lock into a separate function that lives in caps.c. Have remove_session_caps_cb call the new helper after taking the lock.
Signed-off-by: Jeff Layton jlayton@kernel.org Signed-off-by: Ilya Dryomov idryomov@gmail.com
Conflicts: fs/ceph/mds_client.c [Adaptation context] Signed-off-by: Zizhi Wo wozizhi@huawei.com --- fs/ceph/caps.c | 116 +++++++++++++++++++++++++++++++++++++++++++ fs/ceph/mds_client.c | 108 ++-------------------------------------- fs/ceph/super.h | 1 + 3 files changed, 120 insertions(+), 105 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8e43d07ffa8b..8f3fe7b35ba3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4633,3 +4633,119 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_unlock(&dentry->d_lock); return ret; } + +static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_snap *capsnap; + int capsnap_release = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); + + while (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + __ceph_remove_capsnap(inode, capsnap, NULL, NULL); + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + capsnap_release++; + } + wake_up_all(&ci->i_cap_wq); + wake_up_all(&mdsc->cap_flushing_wq); + return capsnap_release; +} + +int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + bool is_auth; + bool dirty_dropped = false; + int iputs = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->vfs_inode); + + is_auth = (cap == ci->i_auth_cap); + __ceph_remove_cap(cap, false); + if (is_auth) { + struct ceph_cap_flush *cf; + + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { + if (inode->i_data.nrpages > 0) + *invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } + + spin_lock(&mdsc->cap_dirty_lock); + + /* trash all of the cap flushes for this inode */ + while (!list_empty(&ci->i_cap_flush_list)) { + cf = list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + list_del_init(&cf->g_list); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + 
ceph_free_cap_flush(cf); + } + + if (!list_empty(&ci->i_dirty_item)) { + pr_warn_ratelimited( + " dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + dirty_dropped = true; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_warn_ratelimited( + " dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + dirty_dropped = true; + } + spin_unlock(&mdsc->cap_dirty_lock); + + if (dirty_dropped) { + mapping_set_error(inode->i_mapping, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + + if (atomic_read(&ci->i_filelock_ref) > 0) { + /* make further file lock syscall return -EIO */ + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; + pr_warn_ratelimited(" dropping file locks for %p %lld\n", + inode, ceph_ino(inode)); + } + + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { + cf = ci->i_prealloc_cap_flush; + ci->i_prealloc_cap_flush = NULL; + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + } + + if (!list_empty(&ci->i_cap_snaps)) + iputs = remove_capsnaps(mdsc, inode); + } + if (dirty_dropped) + ++iputs; + return iputs; +} diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d5f86f4e90d2..eb17aa44673f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1611,125 +1611,23 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, return ret; }
-static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap_snap *capsnap; - int capsnap_release = 0; - - lockdep_assert_held(&ci->i_ceph_lock); - - dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); - - while (!list_empty(&ci->i_cap_snaps)) { - capsnap = list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, ci_item); - __ceph_remove_capsnap(inode, capsnap, NULL, NULL); - ceph_put_snap_context(capsnap->context); - ceph_put_cap_snap(capsnap); - capsnap_release++; - } - wake_up_all(&ci->i_cap_wq); - wake_up_all(&mdsc->cap_flushing_wq); - return capsnap_release; -} - static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; - struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - bool is_auth; - bool dirty_dropped = false; bool invalidate = false; - int capsnap_release = 0; + int iputs;
dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); - is_auth = (cap == ci->i_auth_cap); - __ceph_remove_cap(cap, false); - if (is_auth) { - struct ceph_cap_flush *cf; - - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - if (inode->i_data.nrpages > 0) - invalidate = true; - if (ci->i_wrbuffer_ref > 0) - mapping_set_error(&inode->i_data, -EIO); - } - - spin_lock(&mdsc->cap_dirty_lock); - - /* trash all of the cap flushes for this inode */ - while (!list_empty(&ci->i_cap_flush_list)) { - cf = list_first_entry(&ci->i_cap_flush_list, - struct ceph_cap_flush, i_list); - list_del_init(&cf->g_list); - list_del_init(&cf->i_list); - if (!cf->is_capsnap) - ceph_free_cap_flush(cf); - } - - if (!list_empty(&ci->i_dirty_item)) { - pr_warn_ratelimited( - " dropping dirty %s state for %p %lld\n", - ceph_cap_string(ci->i_dirty_caps), - inode, ceph_ino(inode)); - ci->i_dirty_caps = 0; - list_del_init(&ci->i_dirty_item); - dirty_dropped = true; - } - if (!list_empty(&ci->i_flushing_item)) { - pr_warn_ratelimited( - " dropping dirty+flushing %s state for %p %lld\n", - ceph_cap_string(ci->i_flushing_caps), - inode, ceph_ino(inode)); - ci->i_flushing_caps = 0; - list_del_init(&ci->i_flushing_item); - mdsc->num_cap_flushing--; - dirty_dropped = true; - } - spin_unlock(&mdsc->cap_dirty_lock); - - if (dirty_dropped) { - mapping_set_error(inode->i_mapping, -EIO); - - if (ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ceph_put_snap_context(ci->i_head_snapc); - ci->i_head_snapc = NULL; - } - } - - if (atomic_read(&ci->i_filelock_ref) > 0) { - /* make further file lock syscall return -EIO */ - ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; - pr_warn_ratelimited(" dropping file locks for %p %lld\n", - inode, ceph_ino(inode)); - } - - if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { - cf = ci->i_prealloc_cap_flush; - ci->i_prealloc_cap_flush = NULL; - if 
(!cf->is_capsnap) - ceph_free_cap_flush(cf); - } - - if (!list_empty(&ci->i_cap_snaps)) - capsnap_release = remove_capsnaps(mdsc, inode); - } + iputs = ceph_purge_inode_cap(inode, cap, &invalidate); spin_unlock(&ci->i_ceph_lock);
wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); - if (dirty_dropped) - iput(inode); - while (capsnap_release--) + while (iputs--) iput(inode); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8716cb618cbb..e0e469e261e2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1167,6 +1167,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_uninline_data(struct file *filp, struct page *locked_page); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); +int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
/* file.c */ extern const struct file_operations ceph_file_fops;
From: Jeff Layton jlayton@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 5d6451b1489ad1781a0778cc876bf26a21910413 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9R4KH CVE: CVE-2023-52732
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5d6451b1489ad1781a0778cc876bf26a21910413
--------------------------------
Add proper error handling for when an async create fails. The inode never existed, so any dirty caps or data are now toast. We already d_drop the dentry in that case, but the now-stale inode may still be around. We want to shut down access to these inodes, and ensure that they can't harbor any more dirty data, which can cause problems at umount time.
When this occurs, flag such inodes as being SHUTDOWN, and trash any caps and cap flushes that may be in flight for them, and invalidate the pagecache for the inode. Add a new helper that can check whether an inode or an entire mount is now shut down, and call it instead of accessing the mount_state directly in places where we test that now.
URL: https://tracker.ceph.com/issues/51279 Signed-off-by: Jeff Layton jlayton@kernel.org Signed-off-by: Ilya Dryomov idryomov@gmail.com
Conflicts: fs/ceph/addr.c fs/ceph/caps.c fs/ceph/inode.c fs/ceph/locks.c fs/ceph/super.c fs/ceph/super.h [Context adaptation; most of it is needed because the CEPH_MOUNT_RECOVER flag is not present in 5.10] Signed-off-by: Zizhi Wo wozizhi@huawei.com --- fs/ceph/addr.c | 16 +++++++++++----- fs/ceph/caps.c | 8 ++++---- fs/ceph/export.c | 12 +++++++++++- fs/ceph/file.c | 10 +++++++++- fs/ceph/inode.c | 33 +++++++++++++++++++++++++++++++-- fs/ceph/locks.c | 7 +++++++ fs/ceph/super.h | 11 +++++++++++ 7 files changed, 84 insertions(+), 13 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3465ff95cb89..65ab91faeb83 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -832,7 +832,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1253,12 +1253,12 @@ static struct ceph_snap_context * ceph_find_incompatible(struct page *page) { struct inode *inode = page->mapping->host; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode);
- if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout(" page %p forced umount\n", page); - return ERR_PTR(-EIO); + if (ceph_inode_is_shutdown(inode)) { + dout(" page %p %llx:%llx is shutdown\n", page, + ceph_vinop(inode)); + return ERR_PTR(-ESTALE); }
for (;;) { @@ -1496,6 +1496,9 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS;
+ if (ceph_inode_is_shutdown(inode)) + return ret; + ceph_block_sigs(&oldset);
dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", @@ -1591,6 +1594,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS;
+ if (ceph_inode_is_shutdown(inode)) + return ret; + prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return VM_FAULT_OOM; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8f3fe7b35ba3..a8761e6acabc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2839,9 +2839,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, goto out_unlock; }
- if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - ret = -EIO; + if (ceph_inode_is_shutdown(inode)) { + dout("get_cap_refs %p inode is shutdown\n", inode); + ret = -ESTALE; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); @@ -4676,7 +4676,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali if (is_auth) { struct ceph_cap_flush *cf;
- if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { + if (ceph_inode_is_shutdown(inode)) { if (inode->i_data.nrpages > 0) *invalidate = true; if (ci->i_wrbuffer_ref > 0) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 042bb4a02c0a..952a590dd4bc 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -157,6 +157,11 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino) ceph_mdsc_put_request(req); if (!inode) return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE); + } else { + if (ceph_inode_is_shutdown(inode)) { + iput(inode); + return ERR_PTR(-ESTALE); + } } return inode; } @@ -223,8 +228,13 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, return ERR_PTR(-ESTALE);
inode = ceph_find_inode(sb, vino); - if (inode) + if (inode) { + if (ceph_inode_is_shutdown(inode)) { + iput(inode); + return ERR_PTR(-ESTALE); + } return d_obtain_alias(inode); + }
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, USE_ANY_MDS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d4974c652e8e..180eb466e597 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -525,6 +525,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
if (result) { struct dentry *dentry = req->r_dentry; + struct inode *inode = d_inode(dentry); int pathlen = 0; u64 base = 0; char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, @@ -534,7 +535,8 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, if (!d_unhashed(dentry)) d_drop(dentry);
- /* FIXME: start returning I/O errors on all accesses? */ + ceph_inode_shutdown(inode); + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", base, IS_ERR(path) ? "<<bad>>" : path, result); ceph_mdsc_free_path(path, pathlen); @@ -1557,6 +1559,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+ if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + if (direct_lock) ceph_start_io_direct(inode); else @@ -1714,6 +1719,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t pos; loff_t limit = max(i_size_read(inode), fsc->max_file_size);
+ if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 36e3342d3633..c41ea2ee25fb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1884,13 +1884,12 @@ void ceph_queue_vmtruncate(struct inode *inode) static void ceph_do_invalidate_pages(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); u32 orig_gen; int check = 0;
mutex_lock(&ci->i_truncate_mutex);
- if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (ceph_inode_is_shutdown(inode)) { pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", inode, ceph_ino(inode)); mapping_set_error(inode->i_mapping, -EIO); @@ -2254,6 +2253,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS;
+ if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + err = setattr_prepare(dentry, attr); if (err != 0) return err; @@ -2375,6 +2377,9 @@ int ceph_getattr(const struct path *path, struct kstat *stat, u32 valid_mask = STATX_BASIC_STATS; int err = 0;
+ if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + /* Skip the getattr altogether if we're asked not to sync */ if (!(flags & AT_STATX_DONT_SYNC)) { err = ceph_do_getattr(inode, statx_to_caps(request_mask), @@ -2421,3 +2426,27 @@ int ceph_getattr(const struct path *path, struct kstat *stat, stat->result_mask = request_mask & valid_mask; return err; } + +void ceph_inode_shutdown(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + int iputs = 0; + bool invalidate = false; + + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_SHUTDOWN; + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + + p = rb_next(p); + iputs += ceph_purge_inode_cap(inode, cap, &invalidate); + } + spin_unlock(&ci->i_ceph_lock); + + if (invalidate) + ceph_queue_invalidate(inode); + while (iputs--) + iput(inode); +} diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 674d6ea89f71..2012d238a4ee 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -236,6 +236,10 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; + + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + /* No mandatory locks */ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) return -ENOLCK; @@ -305,6 +309,9 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (fl->fl_type & LOCK_MAND) return -EOPNOTSUPP;
+ if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + dout("ceph_flock, fl_file: %p\n", fl->fl_file);
spin_lock(&ci->i_ceph_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e0e469e261e2..c0c5ed3e2718 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -581,6 +581,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */ #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) +#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */
/* * Masks of ceph inode work. @@ -999,6 +1000,16 @@ extern int __ceph_setattr(struct inode *inode, struct iattr *attr); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); +void ceph_inode_shutdown(struct inode *inode); + +static inline bool ceph_inode_is_shutdown(struct inode *inode) +{ + unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int state = READ_ONCE(fsc->mount_state); + + return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN; +}
/* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
From: Xiubo Li xiubli@redhat.com
stable inclusion from stable-v6.1.23 commit 66ec619e4591f8350f99c5269a7ce160cccc7a7c category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9R4KH CVE: CVE-2023-52732
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
[ Upstream commit a68e564adcaa69b0930809fb64d9d5f7d9c32ba9 ]
When we receive a corrupted snap trace, we don't know exactly what has happened on the MDS side, and we shouldn't continue I/O and metadata access to the MDS, which may corrupt data or return incorrect contents.
This patch will just block all the further IO/MDS requests immediately and then evict the kclient itself.
The reason we still need to evict the kclient right after blocking all further I/O is so that the MDS can revoke the caps faster.
Link: https://tracker.ceph.com/issues/57686 Signed-off-by: Xiubo Li xiubli@redhat.com Reviewed-by: Venky Shankar vshankar@redhat.com Signed-off-by: Ilya Dryomov idryomov@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org
Conflicts: fs/ceph/addr.c fs/ceph/caps.c fs/ceph/mds_client.c fs/ceph/snap.c fs/ceph/super.c fs/ceph/super.h include/linux/ceph/libceph.h [Because of the large number of conflicts, a clean backport would require integrating many prerequisite patches, so the context was adapted directly instead] Signed-off-by: Zizhi Wo wozizhi@huawei.com --- fs/ceph/addr.c | 8 ++++++++ fs/ceph/caps.c | 16 +++++++++++++--- fs/ceph/file.c | 3 +++ fs/ceph/mds_client.c | 32 ++++++++++++++++++++++++++++---- fs/ceph/snap.c | 36 ++++++++++++++++++++++++++++++++++-- include/linux/ceph/libceph.h | 1 + 6 files changed, 87 insertions(+), 9 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 65ab91faeb83..0d66e3fe7afc 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -603,6 +603,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %p idx %lu\n", page, page->index);
+ if (ceph_inode_is_shutdown(inode)) + return -EIO; + /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { @@ -1760,6 +1763,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) dout("uninline_data %p %llx.%llx inline_version %llu\n", inode, ceph_vinop(inode), inline_version);
+ if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (inline_version == 1 || /* initial version, no data */ inline_version == CEPH_INLINE_NONE) goto out; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a8761e6acabc..8c1dd9e80e7e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4109,6 +4109,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; + bool close_sessions = false;
dout("handle_caps from mds%d\n", session->s_mds);
@@ -4249,9 +4250,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, - snaptrace + snaptrace_len, - false, &realm); + if (ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm)) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + goto done; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -4311,6 +4316,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_put_string(extra_info.pool_ns); /* avoid calling iput_final() in mds dispatch threads */ ceph_async_iput(inode); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); + return;
flush_cap_releases: diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 180eb466e597..215bd9a3c535 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2004,6 +2004,9 @@ static int ceph_zero_partial_object(struct inode *inode, loff_t zero = 0; int op;
+ if (ceph_inode_is_shutdown(inode)) + return -EIO; + if (!length) { op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; length = &zero; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index eb17aa44673f..aea1b42b9b18 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -712,6 +712,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s;
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return ERR_PTR(-EIO); + if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL);
@@ -1397,6 +1400,9 @@ static int __open_session(struct ceph_mds_client *mdsc, int mstate; int mds = session->s_mds;
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return -EIO; + /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); dout("open_session to mds%d (%s)\n", mds, @@ -2717,6 +2723,11 @@ static void __do_request(struct ceph_mds_client *mdsc, return; }
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { + dout("do_request metadata corrupted\n"); + err = -EIO; + goto finish; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { dout("do_request timed out\n"); @@ -3024,6 +3035,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) u64 tid; int err, result; int mds = session->s_mds; + bool close_sessions = false;
if (msg->front.iov_len < sizeof(*head)) { pr_err("mdsc_handle_reply got corrupt (short) reply\n"); @@ -3142,10 +3154,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, rinfo->snapblob, + err = ceph_update_snap_trace(mdsc, rinfo->snapblob, rinfo->snapblob + rinfo->snapblob_len, le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, &realm); + if (err) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + if (err == -EIO) + ceph_msg_dump(msg); + goto out_err; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -3203,6 +3222,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) req->r_end_latency, err); out: ceph_mdsc_put_request(req); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; }
@@ -4661,7 +4684,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush;
- if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) return;
dout("sync\n"); @@ -4698,7 +4721,7 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) }
/* - * called after sb is ro. + * called after sb is ro or when metadata corrupted. */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { @@ -4995,7 +5018,8 @@ static void peer_reset(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc;
pr_warn("mds%d closed our session\n", s->s_mds); - send_mds_reconnect(mdsc, s); + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + send_mds_reconnect(mdsc, s); }
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index db464682b2cb..89cacdd71b63 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/iversion.h> @@ -702,8 +703,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms);
lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -820,6 +823,27 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("update_snap_trace error %d\n", err); + + /* + * When receiving a corrupted snap trace we don't know what + * exactly has happened in MDS side. And we shouldn't continue + * writing to OSD, which may corrupt the snapshot contents. + * + * Just try to blocklist this kclient and then this kclient + * must be remounted to continue after the corrupted metadata + * fixed in the MDS side. + */ + WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) + pr_err("%s failed to blocklist %s: %d\n", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + + WARN(1, "%s: %s%sdo remount to continue%s", + __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), + ret ? "" : " was blocklisted, ", + err == -EIO ? " after corrupted snaptrace is fixed" : ""); + return err; }
@@ -888,6 +912,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, __le64 *split_inos = NULL, *split_realms = NULL; int i; int locked_rwsem = 0; + bool close_sessions = false;
/* decode */ if (msg->front.iov_len < sizeof(*h)) @@ -1029,8 +1054,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, * update using the provided snap trace. if we are deleting a * snap, we can avoid queueing cap_snaps. */ - ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY, NULL); + if (ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY, + NULL)) { + close_sessions = true; + goto bad; + }
if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ @@ -1049,6 +1078,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; }
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index c8645f0b797d..15ea07f9ccfb 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -104,6 +104,7 @@ enum { CEPH_MOUNT_UNMOUNTING, CEPH_MOUNT_UNMOUNTED, CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_FENCE_IO, };
static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/8524 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/U...
Feedback: The patch(es) which you have sent to the kernel@openeuler.org mailing list have been successfully converted to a pull request! Pull request link: https://gitee.com/openeuler/kernel/pulls/8524 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/U...