Allison Henderson (1):
      xfs: increase rename inode reservation

ChenXiaoSong (1):
      xfs: fix NULL pointer dereference in xfs_getbmap()

Darrick J. Wong (6):
      xfs: check return codes when flushing block devices
      xfs: fix maxlevels comparisons in the btree staging code
      xfs: shut down filesystem if we xfs_trans_cancel with deferred work items
      xfs: don't expose internal symlink metadata buffers to the vfs
      xfs: fix negative array access in xfs_getbmap
      iomap: fix memory corruption when recording errors during writeback

Dave Chinner (13):
      xfs: remove xfs_blkdev_issue_flush
      xfs: AIL should be log centric
      xfs: shutdown in intent recovery has non-intent items in the AIL
      xfs: log shutdown triggers should only shut down the log
      xfs: xfs_do_force_shutdown needs to block racing shutdowns
      xfs: xfs_trans_commit() path must check for log shutdown
      xfs: shutdown during log recovery needs to mark the log shutdown
      xfs: remove XFS_PREALLOC_SYNC
      xfs: fallocate() should call file_modified()
      xfs: sb verifier doesn't handle uncached sb buffer
      xfs: write page faults in iomap are not buffered writes
      iomap: write iomap validity checks
      xfs: use iomap_valid method to detect stale cached iomaps

Gao Xiang (1):
      xfs: account extra freespace btree splits for multiple allocations

Guo Xuenan (3):
      Revert "[Huawei] xfs: fix uaf when leaf dir bestcount not match with dir data blocks"
      xfs: fix exception caused by unexpected illegal bestcount in leaf dir
      xfs: force shutdown xfs when xfs_attr_inactive fails

Long Li (4):
      xfs: fix ag count overflow during growfs
      xfs: fix hung when transaction commit fail in xfs_inactive_ifree
      xfs: fix a UAF when inode item push
      xfs: fix a UAF in xfs_iflush_abort_clean

Shida Zhang (1):
      xfs: trim the mapp array accordingly in xfs_da_grow_inode_int

Wu Guanghao (1):
      xfs: fix the problem of mount failure caused by not refreshing mp->m_sb

Ye Bin (2):
      xfs: fix BUG_ON in xfs_getbmap()
      xfs: fix dead loop when do mount with IO fault injection

Zhang Yi (2):
      xfs: factor out __xfs_da3_node_read()
      xfs: atomic drop extent entries when inactiving attr
 fs/iomap/apply.c                  |  14 +++-
 fs/iomap/buffered-io.c            |  29 ++++++++-
 fs/xfs/libxfs/xfs_alloc.c         |   9 ++-
 fs/xfs/libxfs/xfs_alloc.h         |   1 +
 fs/xfs/libxfs/xfs_bmap.c          |   7 +-
 fs/xfs/libxfs/xfs_btree_staging.c |   4 +-
 fs/xfs/libxfs/xfs_da_btree.c      |   7 +-
 fs/xfs/libxfs/xfs_da_btree.h      |  15 ++++-
 fs/xfs/libxfs/xfs_dir2_leaf.c     |  21 ++----
 fs/xfs/libxfs/xfs_ialloc.c        |   1 +
 fs/xfs/libxfs/xfs_sb.c            |   2 +-
 fs/xfs/libxfs/xfs_trans_resv.c    |   4 +-
 fs/xfs/xfs_aops.c                 |   2 +-
 fs/xfs/xfs_attr_inactive.c        |  66 ++++++++++++++-----
 fs/xfs/xfs_bmap_util.c            |  34 +++++-----
 fs/xfs/xfs_buf.c                  |   7 +-
 fs/xfs/xfs_buf.h                  |   7 +-
 fs/xfs/xfs_buf_item.c             |  20 ++++++
 fs/xfs/xfs_buf_item_recover.c     |  24 +++++++
 fs/xfs/xfs_file.c                 |  49 ++++++++------
 fs/xfs/xfs_fsops.c                |   9 ++-
 fs/xfs/xfs_inode.c                |   5 +-
 fs/xfs/xfs_inode_item.c           |  11 +++-
 fs/xfs/xfs_iomap.c                | 105 ++++++++++++++++++++++++------
 fs/xfs/xfs_iomap.h                |   7 +-
 fs/xfs/xfs_iops.c                 |  34 +---------
 fs/xfs/xfs_log.c                  |  68 +++++++++++++------
 fs/xfs/xfs_log_cil.c              |   4 +-
 fs/xfs/xfs_log_priv.h             |  11 ++++
 fs/xfs/xfs_log_recover.c          |  56 ++++++----------
 fs/xfs/xfs_mount.c                |   1 +
 fs/xfs/xfs_pnfs.c                 |  12 ++--
 fs/xfs/xfs_super.c                |   7 --
 fs/xfs/xfs_super.h                |   1 -
 fs/xfs/xfs_symlink.c              |  27 +++++---
 fs/xfs/xfs_trans.c                |  61 ++++++++++++-----
 fs/xfs/xfs_trans_ail.c            |  30 ++++-----
 fs/xfs/xfs_trans_priv.h           |   3 +-
 include/linux/iomap.h             |  39 +++++++++--
 39 files changed, 549 insertions(+), 265 deletions(-)
From: ChenXiaoSong chenxiaosong2@huawei.com
mainline inclusion from mainline-v5.19-rc5 commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Reproducer:
 1. fallocate -l 100M image
 2. mkfs.xfs -f image
 3. mount image /mnt
 4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
 5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00"
                   "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7";
    fd = open("/mnt", O_RDONLY|O_DIRECTORY);
    ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
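For convenience, the steps above can be folded into a small standalone program. This is only a sketch: the raw ioctl request and the xattr name are copied verbatim from the reproducer, /mnt is assumed to be the freshly mounted image, and steps 4 and 5 must race (e.g. run them in a loop from two processes) for the NULL pointer dereference to trigger.

#include <fcntl.h>
#include <linux/ioctl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/xattr.h>
#include <unistd.h>

int main(void)
{
	/* steps 1-3 (fallocate, mkfs.xfs, mount) are done beforehand */
	char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00"
		       "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7";
	int fd = open("/mnt", O_RDONLY | O_DIRECTORY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* step 4: creates the attr fork while the ioctl below is running */
	setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE);

	/* step 5: the raw getbmap-style ioctl from the reproducer */
	ioctl(fd, _IOC(_IOC_READ | _IOC_WRITE, 0x58, 0x2c, 0x20), arg);

	close(fd);
	return 0;
}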
NULL pointer dereference will occur when race happens between xfs_getbmap() and xfs_bmap_set_attrforkoff():
              ioctl                    |       setxattr
 ---------------------------------------|---------------------------
 xfs_getbmap                            |
   xfs_ifork_ptr                        |
     xfs_inode_has_attr_fork            |
       ip->i_forkoff == 0               |
     return NULL                        |
   ifp == NULL                          |
                                        | xfs_bmap_set_attrforkoff
                                        |   ip->i_forkoff > 0
   xfs_inode_has_attr_fork              |
     ip->i_forkoff > 0                  |
   ifp == NULL                          |
   ifp->if_format                       |
Fix this by locking i_lock before xfs_ifork_ptr().
Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers") Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: added fixes tag] Signed-off-by: Darrick J. Wong djwong@kernel.org
conflicts: fs/xfs/xfs_bmap_util.c
Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_bmap_util.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index aaa6ffc0c923..b02b4f0a151d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -397,29 +397,29 @@ xfs_getbmap(
 		whichfork = XFS_COW_FORK;
 	else
 		whichfork = XFS_DATA_FORK;
-	ifp = xfs_ifork_ptr(ip, whichfork);

 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	switch (whichfork) {
 	case XFS_ATTR_FORK:
+		lock = xfs_ilock_attr_map_shared(ip);
 		if (!xfs_inode_has_attr_fork(ip))
-			goto out_unlock_iolock;
+			goto out_unlock_ilock;

 		max_len = 1LL << 32;
-		lock = xfs_ilock_attr_map_shared(ip);
 		break;
 	case XFS_COW_FORK:
+		lock = XFS_ILOCK_SHARED;
+		xfs_ilock(ip, lock);
+
 		/* No CoW fork? Just return */
-		if (!ifp)
-			goto out_unlock_iolock;
+		if (!xfs_ifork_ptr(ip, whichfork))
+			goto out_unlock_ilock;

 		if (xfs_get_cowextsz_hint(ip))
 			max_len = mp->m_super->s_maxbytes;
 		else
 			max_len = XFS_ISIZE(ip);

-		lock = XFS_ILOCK_SHARED;
-		xfs_ilock(ip, lock);
 		break;
 	case XFS_DATA_FORK:
 		if (!(iflags & BMV_IF_DELALLOC) &&
@@ -449,6 +449,8 @@ xfs_getbmap(
 		break;
 	}

+	ifp = xfs_ifork_ptr(ip, whichfork);
+
 	switch (ifp->if_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 	case XFS_DINODE_FMT_BTREE:
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.13-rc4 commit b5071ada510a76eac0d02912bf66297b9e30ca59 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
It's a one line wrapper around blkdev_issue_flush(). Just replace it with direct calls to blkdev_issue_flush().
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Allison Henderson allison.henderson@oracle.com Signed-off-by: Darrick J. Wong djwong@kernel.org
conflicts: fs/xfs/xfs_log.c fs/xfs/xfs_super.c fs/xfs/xfs_super.h
Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_file.c | 6 +++--- fs/xfs/xfs_log.c | 3 ++- fs/xfs/xfs_super.c | 7 ------- fs/xfs/xfs_super.h | 1 - 5 files changed, 6 insertions(+), 13 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 77c9eebb9c9e..73adabb98aa2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1970,7 +1970,7 @@ xfs_free_buftarg(
 	percpu_counter_destroy(&btp->bt_io_count);
 	list_lru_destroy(&btp->bt_lru);

-	xfs_blkdev_issue_flush(btp);
+	blkdev_issue_flush(btp->bt_bdev, GFP_NOFS);
 	invalidate_bdev(btp->bt_bdev);

 	kmem_free(btp);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0d0a282c7fc5..80ba4666891e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -197,9 +197,9 @@ xfs_file_fsync(
 	 * inode size in case of an extending write.
 	 */
 	if (XFS_IS_REALTIME_INODE(ip))
-		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+		blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev, GFP_NOFS);
 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
-		xfs_blkdev_issue_flush(mp->m_ddev_targp);
+		blkdev_issue_flush(mp->m_ddev_targp->bt_bdev, GFP_NOFS);

 	/*
 	 * Any inode that has dirty modifications in the log is pinned.  The
@@ -219,7 +219,7 @@ xfs_file_fsync(
 	 */
 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
 	    mp->m_logdev_targp == mp->m_ddev_targp)
-		xfs_blkdev_issue_flush(mp->m_ddev_targp);
+		blkdev_issue_flush(mp->m_ddev_targp->bt_bdev, GFP_NOFS);

 	return error;
 }
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e6a824bed12a..d27d7e9a3478 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1809,7 +1809,8 @@ xlog_write_iclog(
 		 * but it *must* complete before we issue the external log IO.
 		 */
 		if (log->l_targ != log->l_mp->m_ddev_targp)
-			xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
+			blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev,
+					GFP_NOFS);
 	}
 	if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
 		iclog->ic_bio.bi_opf |= REQ_FUA;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c4500db05a86..e83af9027131 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -355,13 +355,6 @@ xfs_blkdev_put(
 	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }

-void
-xfs_blkdev_issue_flush(
-	xfs_buftarg_t		*buftarg)
-{
-	blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
-}
-
 STATIC void
 xfs_close_devices(
 	struct xfs_mount	*mp)
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 1ca484b8357f..79cb2dece811 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -88,7 +88,6 @@ struct block_device;

 extern void xfs_quiesce_attr(struct xfs_mount *mp);
 extern void xfs_flush_inodes(struct xfs_mount *mp);
-extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
 					   xfs_agnumber_t agcount);
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.19-rc5 commit 7d839e325af221ff69d52e15c112cf09da91d149 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If a blkdev_issue_flush fails, fsync needs to report that to upper levels. Modify xfs_file_fsync to capture the errors, while trying to flush as much data and log updates to disk as possible.
If log writes cannot flush the data device, we need to shut down the log immediately because we've violated a log invariant. Modify this code to check the return value of blkdev_issue_flush as well.
This behavior seems to go back to about 2.6.15 or so, which makes this fixes tag a bit misleading.
Link: https://elixir.bootlin.com/linux/v2.6.15/source/fs/xfs/xfs_vnodeops.c#L1187 Fixes: b5071ada510a ("xfs: remove xfs_blkdev_issue_flush") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com
conflicts: fs/xfs/xfs_file.c fs/xfs/xfs_log.c
Signed-off-by: Ye Bin yebin@huaweicloud.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_file.c | 23 ++++++++++++++--------- fs/xfs/xfs_log.c | 14 +++++++++++--- 2 files changed, 25 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 80ba4666891e..62057d85895d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -176,7 +176,7 @@ xfs_file_fsync(
 {
 	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = 0;
+	int			error, err2;
 	int			log_flushed = 0;

 	trace_xfs_file_fsync(ip);
@@ -197,18 +197,21 @@ xfs_file_fsync(
 	 * inode size in case of an extending write.
 	 */
 	if (XFS_IS_REALTIME_INODE(ip))
-		blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev, GFP_NOFS);
+		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev, GFP_NOFS);
 	else if (mp->m_logdev_targp != mp->m_ddev_targp)
-		blkdev_issue_flush(mp->m_ddev_targp->bt_bdev, GFP_NOFS);
+		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev, GFP_NOFS);

 	/*
 	 * Any inode that has dirty modifications in the log is pinned.  The
-	 * racy check here for a pinned inode while not catch modifications
+	 * racy check here for a pinned inode will not catch modifications
 	 * that happen concurrently to the fsync call, but fsync semantics
 	 * only require to sync previously completed I/O.
 	 */
-	if (xfs_ipincount(ip))
-		error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+	if (xfs_ipincount(ip)) {
+		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+		if (err2 && !error)
+			error = err2;
+	}

 	/*
 	 * If we only have a single device, and the log force about was
@@ -218,9 +221,11 @@ xfs_file_fsync(
 	 * commit.
 	 */
 	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
-	    mp->m_logdev_targp == mp->m_ddev_targp)
-		blkdev_issue_flush(mp->m_ddev_targp->bt_bdev, GFP_NOFS);
-
+	    mp->m_logdev_targp == mp->m_ddev_targp) {
+		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev, GFP_NOFS);
+		if (err2 && !error)
+			error = err2;
+	}
 	return error;
 }

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index d27d7e9a3478..d3b4d95c62fc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1807,10 +1807,18 @@ xlog_write_iclog(
 		 * device cache first to ensure all metadata writeback covered
 		 * by the LSN in this iclog is on stable storage. This is slow,
 		 * but it *must* complete before we issue the external log IO.
+		 *
+		 * If the flush fails, we cannot conclude that past metadata
+		 * writeback from the log succeeded.  Repeating the flush is
+		 * not possible, hence we must shut down with log IO error to
+		 * avoid shutdown re-entering this path and erroring out again.
 		 */
-		if (log->l_targ != log->l_mp->m_ddev_targp)
-			blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev,
-					GFP_NOFS);
+		if (log->l_targ != log->l_mp->m_ddev_targp &&
+		    blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev,
+					GFP_NOFS)) {
+			xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+			return;
+		}
 	}
 	if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
 		iclog->ic_bio.bi_opf |= REQ_FUA;
From: Shida Zhang zhangshida@kylinos.cn
mainline inclusion from mainline-v6.0-rc6 commit 44159659df8ca381b84261e11058b2176fa03ba0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Take a look at the for-loop in xfs_da_grow_inode_int:
======
for(){
	nmap = min(XFS_BMAP_MAX_NMAP, count);
	...
	error = xfs_bmapi_write(...,&mapp[mapi], &nmap);//(..., $1, $2)
	...
	mapi += nmap;
}
=====
where $1 stands for the start address of the array, while $2 is used to
indicate the size of the array.
The array $1 will advance by $nmap in each iteration after the allocation of extents. But the size $2 still remains unchanged, which is determined by min(XFS_BMAP_MAX_NMAP, count).
It seems the loop forgot to trim the size of the mapp array after each iteration, so fix that.
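For clarity, an annotated sketch of the fixed loop (same identifiers as above, '...' elides unchanged code):

for (b = *bno, mapi = 0; b < *bno + count; ) {
	c = (int)(*bno + count - b);		/* blocks still to map */
	nmap = min(XFS_BMAP_MAX_NMAP, c);	/* trim $2 to what is left */
	error = xfs_bmapi_write(..., &mapp[mapi], &nmap);
	...
	mapi += nmap;				/* $1 advances by nmap */
}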
Signed-off-by: Shida Zhang zhangshida@kylinos.cn Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Dave Chinner david@fromorbit.com Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_da_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index b1f4014f9bcc..66b646269d29 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2180,8 +2180,8 @@ xfs_da_grow_inode_int(
 	 */
 	mapp = kmem_alloc(sizeof(*mapp) * count, 0);
 	for (b = *bno, mapi = 0; b < *bno + count; ) {
-		nmap = min(XFS_BMAP_MAX_NMAP, count);
 		c = (int)(*bno + count - b);
+		nmap = min(XFS_BMAP_MAX_NMAP, c);
 		error = xfs_bmapi_write(tp, dp, b, c,
 				xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
 				args->total, &mapp[mapi], &nmap);
From: Allison Henderson allison.henderson@oracle.com
mainline inclusion from mainline-v6.1-rc1 commit e07ee6fe21f47cfd72ae566395c67a80e7c66163 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
xfs_rename can update up to 5 inodes: src_dp, target_dp, src_ip, target_ip and wip. So we need to increase the inode reservation to match.
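As an illustration (not part of the patch, paths are placeholders), all five inodes end up dirtied by a whiteout rename over an existing target, assuming a glibc that exposes renameat2():

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	/*
	 * Modifies: the source directory, the target directory, the source
	 * inode, the existing target inode being replaced, and the whiteout
	 * (wip) inode that takes the source entry's place.
	 */
	if (renameat2(AT_FDCWD, "/mnt/srcdir/a",
		      AT_FDCWD, "/mnt/dstdir/b", RENAME_WHITEOUT))
		perror("renameat2");
	return 0;
}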
Signed-off-by: Allison Henderson allison.henderson@oracle.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_trans_resv.c | 4 ++-- fs/xfs/xfs_inode.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 2db9d9d12344..8d2a3c339167 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -330,7 +330,7 @@ xfs_calc_itruncate_reservation(

 /*
  * In renaming a files we can modify:
- *    the four inodes involved: 4 * inode size
+ *    the five inodes involved: 5 * inode size
  *    the two directory btrees: 2 * (max depth + v2) * dir block size
  *    the two directory bmap btrees: 2 * max depth * block size
  * And the bmap_finish transaction can free dir and bmap blocks (two sets
@@ -345,7 +345,7 @@ xfs_calc_rename_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		max((xfs_calc_inode_res(mp, 4) +
+		max((xfs_calc_inode_res(mp, 5) +
 		     xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
 				      XFS_FSB_TO_B(mp, 1))),
 		    (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 49021758616d..ea360c9c223d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3284,7 +3284,7 @@ xfs_rename(
 	 * Lock all the participating inodes.  Depending upon whether
 	 * the target_name exists in the target directory, and
 	 * whether the target directory is the same as the source
-	 * directory, we can lock from 2 to 4 inodes.
+	 * directory, we can lock from 2 to 5 inodes.
 	 */
 	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.17-rc6 commit 8eda87211097195d96d7d12be37dd39d6a7c8b80 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The AIL operates purely on log items, so it is a log centric subsystem. Divorce it from the xfs_mount and instead have it pass around xlog pointers.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Chandan Babu R chandan.babu@oracle.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_trans.c | 2 +- fs/xfs/xfs_trans_ail.c | 26 +++++++++++++------------- fs/xfs/xfs_trans_priv.h | 3 ++- 3 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2832e49cc5d2..23cddfec03ef 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -776,7 +776,7 @@ xfs_trans_committed_bulk(
 		 * object into the AIL as we are in a shutdown situation.
 		 */
 		if (aborted) {
-			ASSERT(xfs_is_shutdown(ailp->ail_mount));
+			ASSERT(xlog_is_shutdown(ailp->ail_log));
 			if (lip->li_ops->iop_unpin)
 				lip->li_ops->iop_unpin(lip, 1);
 			continue;
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1b52952097c1..c2ccb98c7bcd 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -398,7 +398,7 @@ xfsaild_push_item(
 	 * If log item pinning is enabled, skip the push and track the item as
 	 * pinned. This can help induce head-behind-tail conditions.
 	 */
-	if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
+	if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
 		return XFS_ITEM_PINNED;

 	/*
@@ -418,7 +418,7 @@ static long
 xfsaild_push(
 	struct xfs_ail		*ailp)
 {
-	xfs_mount_t		*mp = ailp->ail_mount;
+	struct xfs_mount	*mp = ailp->ail_log->l_mp;
 	struct xfs_ail_cursor	cur;
 	struct xfs_log_item	*lip;
 	xfs_lsn_t		lsn;
@@ -443,7 +443,7 @@ xfsaild_push(
 		ailp->ail_log_flush = 0;

 		XFS_STATS_INC(mp, xs_push_ail_flush);
-		xlog_cil_flush(mp->m_log);
+		xlog_cil_flush(ailp->ail_log);
 	}

 	spin_lock(&ailp->ail_lock);
@@ -632,7 +632,7 @@ xfsaild(
 			 * opportunity to release such buffers from the queue.
 			 */
 			ASSERT(list_empty(&ailp->ail_buf_list) ||
-			       xfs_is_shutdown(ailp->ail_mount));
+			       xlog_is_shutdown(ailp->ail_log));
 			xfs_buf_delwri_cancel(&ailp->ail_buf_list);
 			break;
 		}
@@ -695,7 +695,7 @@ xfs_ail_push(
 	struct xfs_log_item	*lip;

 	lip = xfs_ail_min(ailp);
-	if (!lip || xfs_is_shutdown(ailp->ail_mount) ||
+	if (!lip || xlog_is_shutdown(ailp->ail_log) ||
 	    XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
 		return;

@@ -751,7 +751,7 @@ xfs_ail_update_finish(
 	struct xfs_ail		*ailp,
 	xfs_lsn_t		old_lsn) __releases(ailp->ail_lock)
 {
-	struct xfs_mount	*mp = ailp->ail_mount;
+	struct xlog		*log = ailp->ail_log;

 	/* if the tail lsn hasn't changed, don't do updates or wakeups. */
 	if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
@@ -759,13 +759,13 @@ xfs_ail_update_finish(
 		return;
 	}

-	if (!xfs_is_shutdown(mp))
-		xlog_assign_tail_lsn_locked(mp);
+	if (!xlog_is_shutdown(log))
+		xlog_assign_tail_lsn_locked(log->l_mp);

 	if (list_empty(&ailp->ail_head))
 		wake_up_all(&ailp->ail_empty);
 	spin_unlock(&ailp->ail_lock);
-	xfs_log_space_wake(mp);
+	xfs_log_space_wake(log->l_mp);
 }

 /*
@@ -873,13 +873,13 @@ xfs_trans_ail_delete(
 	int			shutdown_type)
 {
 	struct xfs_ail		*ailp = lip->li_ailp;
-	struct xfs_mount	*mp = ailp->ail_mount;
+	struct xfs_mount	*mp = ailp->ail_log->l_mp;
 	xfs_lsn_t		tail_lsn;

 	spin_lock(&ailp->ail_lock);
 	if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
 		spin_unlock(&ailp->ail_lock);
-		if (shutdown_type && !xfs_is_shutdown(mp)) {
+		if (shutdown_type && !xlog_is_shutdown(ailp->ail_log)) {
 			xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
 				"%s: attempting to delete a log item that is not in the AIL",
 					__func__);
@@ -904,7 +904,7 @@ xfs_trans_ail_init(
 	if (!ailp)
 		return -ENOMEM;

-	ailp->ail_mount = mp;
+	ailp->ail_log = mp->m_log;
 	INIT_LIST_HEAD(&ailp->ail_head);
 	INIT_LIST_HEAD(&ailp->ail_cursors);
 	spin_lock_init(&ailp->ail_lock);
@@ -912,7 +912,7 @@ xfs_trans_ail_init(
 	init_waitqueue_head(&ailp->ail_empty);

 	ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
-			ailp->ail_mount->m_super->s_id);
+			mp->m_super->s_id);
 	if (IS_ERR(ailp->ail_task))
 		goto out_free_ailp;

diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3004aeac9110..f0d79a9050ba 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -6,6 +6,7 @@
 #ifndef __XFS_TRANS_PRIV_H__
 #define __XFS_TRANS_PRIV_H__

+struct xlog;
 struct xfs_log_item;
 struct xfs_mount;
 struct xfs_trans;
@@ -50,7 +51,7 @@ struct xfs_ail_cursor {
  * Eventually we need to drive the locking in here as well.
  */
 struct xfs_ail {
-	struct xfs_mount	*ail_mount;
+	struct xlog		*ail_log;
 	struct task_struct	*ail_task;
 	struct list_head	ail_head;
 	xfs_lsn_t		ail_target;
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.17-rc6 commit ab9c81ef321f90dd208b1d4809c196c2794e4b15 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
generic/388 triggered a failure in RUI recovery due to a corrupted btree record and the system then locked up hard due to a subsequent assert failure while holding a spinlock cancelling intents:
XFS (pmem1): Corruption of in-memory data (0x8) detected at xfs_do_force_shutdown+0x1a/0x20 (fs/xfs/xfs_trans.c:964).  Shutting down filesystem.
XFS (pmem1): Please unmount the filesystem and rectify the problem(s)
XFS: Assertion failed: !xlog_item_is_intent(lip), file: fs/xfs/xfs_log_recover.c, line: 2632
Call Trace:
 <TASK>
 xlog_recover_cancel_intents.isra.0+0xd1/0x120
 xlog_recover_finish+0xb9/0x110
 xfs_log_mount_finish+0x15a/0x1e0
 xfs_mountfs+0x540/0x910
 xfs_fs_fill_super+0x476/0x830
 get_tree_bdev+0x171/0x270
 ? xfs_init_fs_context+0x1e0/0x1e0
 xfs_fs_get_tree+0x15/0x20
 vfs_get_tree+0x24/0xc0
 path_mount+0x304/0xba0
 ? putname+0x55/0x60
 __x64_sys_mount+0x108/0x140
 do_syscall_64+0x35/0x80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
Essentially, there's dirty metadata in the AIL from intent recovery transactions, so when we go to cancel the remaining intents we assume that all objects after the first non-intent log item in the AIL are not intents.
This is not true. Intent recovery can log new intents to continue the operations the original intent could not complete in a single transaction. The new intents are committed before they are deferred, which means if the CIL commits in the background they will get inserted into the AIL at the head.
Hence if we shut down the filesystem while processing intent recovery, the AIL may have new intents active at the current head. Hence this check:
	/*
	 * We're done when we see something other than an intent.
	 * There should be no intents left in the AIL now.
	 */
	if (!xlog_item_is_intent(lip)) {
		for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
			ASSERT(!xlog_item_is_intent(lip));
		break;
	}
in both xlog_recover_process_intents() and xlog_recover_cancel_intents() is simply not valid. It was valid back when we only had EFI/EFD intents and didn't chain intents, but it hasn't been valid ever since intent recovery could create and commit new intents.
Given that crashing the mount task like this pretty much prevents diagnosing what went wrong that led to the initial failure that triggered intent cancellation, just remove the checks altogether.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_log_recover.c | 50 ++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 33 deletions(-)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 01fc8247c3af..96490b99b89a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2497,21 +2497,22 @@ xlog_abort_defer_ops(
 		xfs_defer_ops_release(mp, dfc);
 	}
 }
+
 /*
  * When this is called, all of the log intent items which did not have
- * corresponding log done items should be in the AIL.  What we do now
- * is update the data structures associated with each one.
+ * corresponding log done items should be in the AIL. What we do now is update
+ * the data structures associated with each one.
  *
- * Since we process the log intent items in normal transactions, they
- * will be removed at some point after the commit.  This prevents us
- * from just walking down the list processing each one.  We'll use a
- * flag in the intent item to skip those that we've already processed
- * and use the AIL iteration mechanism's generation count to try to
- * speed this up at least a bit.
+ * Since we process the log intent items in normal transactions, they will be
+ * removed at some point after the commit. This prevents us from just walking
+ * down the list processing each one. We'll use a flag in the intent item to
+ * skip those that we've already processed and use the AIL iteration mechanism's
+ * generation count to try to speed this up at least a bit.
  *
- * When we start, we know that the intents are the only things in the
- * AIL.  As we process them, however, other items are added to the
- * AIL.
+ * When we start, we know that the intents are the only things in the AIL. As we
+ * process them, however, other items are added to the AIL. Hence we know we
+ * have started recovery on all the pending intents when we find an non-intent
+ * item in the AIL.
  */
 STATIC int
 xlog_recover_process_intents(
@@ -2534,17 +2535,8 @@ xlog_recover_process_intents(
 	for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
 	     lip != NULL;
 	     lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
-		/*
-		 * We're done when we see something other than an intent.
-		 * There should be no intents left in the AIL now.
-		 */
-		if (!xlog_item_is_intent(lip)) {
-#ifdef DEBUG
-			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
-				ASSERT(!xlog_item_is_intent(lip));
-#endif
+		if (!xlog_item_is_intent(lip))
 			break;
-		}

 		/*
 		 * We should never see a redo item with a LSN higher than
@@ -2582,8 +2574,9 @@ xlog_recover_process_intents(
 }

 /*
- * A cancel occurs when the mount has failed and we're bailing out.
- * Release all pending log intent items so they don't pin the AIL.
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending log intent items that we haven't started recovery on so they don't
+ * pin the AIL.
  */
 STATIC void
 xlog_recover_cancel_intents(
@@ -2597,17 +2590,8 @@ xlog_recover_cancel_intents(
 	spin_lock(&ailp->ail_lock);
 	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
 	while (lip != NULL) {
-		/*
-		 * We're done when we see something other than an intent.
-		 * There should be no intents left in the AIL now.
-		 */
-		if (!xlog_item_is_intent(lip)) {
-#ifdef DEBUG
-			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
-				ASSERT(!xlog_item_is_intent(lip));
-#endif
+		if (!xlog_item_is_intent(lip))
 			break;
-		}

 		spin_unlock(&ailp->ail_lock);
 		lip->li_ops->iop_release(lip);
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.17-rc6 commit b5f17bec1213a3ed2f4d79ad4c566e00cabe2a9b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We've got a mess on our hands.
1. xfs_trans_commit() cannot cancel transactions because the mount is shut down - that causes dirty, aborted, unlogged log items to sit unpinned in memory and potentially get written to disk before the log is shut down. Hence xfs_trans_commit() can only abort transactions when xlog_is_shutdown() is true.
2. xfs_force_shutdown() is used in places to cause the current modification to be aborted via xfs_trans_commit() because it may be impractical or impossible to cancel the transaction directly, and hence xfs_trans_commit() must cancel transactions when xfs_is_shutdown() is true in this situation. But we can't do that because of #1.
3. Log IO errors cause log shutdowns by calling xfs_force_shutdown() to shut down the mount and then the log from log IO completion.
4. xfs_force_shutdown() can result in a log force being issued, which has to wait for log IO completion before it will mark the log as shut down. If #3 races with some other shutdown trigger that runs a log force, we rely on xfs_force_shutdown() silently ignoring #3 and avoiding shutting down the log until the failed log force completes.
5. To ensure #2 always works, we have to ensure that xfs_force_shutdown() does not return until the log is shut down. But in the case of #4, this will result in a deadlock because the log IO completion will block waiting for a log force to complete, which is blocked waiting for log IO to complete....
So the very first thing we have to do here to untangle this mess is dissociate log shutdown triggers from mount shutdowns. We already have xlog_force_shutdown(), which will atomically transition the log to a shutdown state. Due to internal asserts it cannot be called multiple times, which was fine because the only place that could call it was xfs_do_force_shutdown() (i.e. the mount shutdown!) and that could only call it once and once only. So the first thing we do is remove the asserts.
We then convert all the internal log shutdown triggers to call xlog_force_shutdown() directly instead of xfs_force_shutdown(). This allows the log shutdown triggers to shut down the log without needing to care about mount based shutdown constraints. This means we shut down the log independently of the mount and the mount may not notice this until its next attempt to read or modify metadata. At that point (e.g. xfs_trans_commit()) it will see that the log is shut down, error out and shut down the mount.
To ensure that all the unmount behaviours and asserts track correctly as a result of a log shutdown, propagate the shutdown up to the mount if it is not already set. This keeps the mount and log state in sync, and saves a huge amount of hassle where code fails because of a log shutdown but only checks for mount shutdowns and hence ends up doing the wrong thing. Cleaning up that mess is an exercise for another day.
This enables us to address the other problems noted above in followup patches.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_log.c | 32 +++++++++++++++++++++++--------- fs/xfs/xfs_log_cil.c | 4 ++-- fs/xfs/xfs_log_recover.c | 6 +++--- fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_trans_ail.c | 8 ++++---- 5 files changed, 33 insertions(+), 18 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index d3b4d95c62fc..13e49b422286 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1307,7 +1307,7 @@ xlog_ioend_work(
 	 */
 	if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
 		xfs_alert(log->l_mp, "log I/O error %d", error);
-		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
 	}

 	xlog_state_done_syncing(iclog);
@@ -1826,7 +1826,7 @@ xlog_write_iclog(
 	iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);

 	if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
-		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
 		return;
 	}
 	if (is_vmalloc_addr(iclog->ic_data))
@@ -2396,7 +2396,7 @@ xlog_write(
 		xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
 			"ctx ticket reservation ran out. Need to up reservation");
 		xlog_print_tic_res(log->l_mp, ticket);
-		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
 	}

 	len = xlog_write_calc_vec_length(ticket, log_vector, optype);
@@ -3730,9 +3730,10 @@ xlog_verify_iclog(
 #endif

 /*
- * Perform a forced shutdown on the log. This should be called once and once
- * only by the high level filesystem shutdown code to shut the log subsystem
- * down cleanly.
+ * Perform a forced shutdown on the log.
+ *
+ * This can be called from low level log code to trigger a shutdown, or from the
+ * high level mount shutdown code when the mount shuts down.
  *
  * Our main objectives here are to make sure that:
  *	a. if the shutdown was not due to a log IO error, flush the logs to
@@ -3741,6 +3742,8 @@ xlog_verify_iclog(
  *	   parties to find out. Nothing new gets queued after this is done.
  *	c. Tasks sleeping on log reservations, pinned objects and
  *	   other resources get woken up.
+ *	d. The mount is also marked as shut down so that log triggered shutdowns
+ *	   still behave the same as if they called xfs_forced_shutdown().
  *
  * Return true if the shutdown cause was a log IO error and we actually shut the
  * log down.
@@ -3759,8 +3762,6 @@ xlog_force_shutdown(
 	if (!log || xlog_in_recovery(log))
 		return false;

-	ASSERT(!xlog_is_shutdown(log));
-
 	/*
 	 * Flush all the completed transactions to disk before marking the log
 	 * being shut down. We need to do this first as shutting down the log
@@ -3787,11 +3788,24 @@ xlog_force_shutdown(
 	spin_lock(&log->l_icloglock);
 	if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) {
 		spin_unlock(&log->l_icloglock);
-		ASSERT(0);
 		return false;
 	}
 	spin_unlock(&log->l_icloglock);

+	/*
+	 * If this log shutdown also sets the mount shutdown state, issue a
+	 * shutdown warning message.
+	 */
+	if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) {
+		xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+"Filesystem has been shut down due to log error (0x%x).",
+				shutdown_flags);
+		xfs_alert(log->l_mp,
+"Please unmount the filesystem and rectify the problem(s).");
+		if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+			xfs_stack_trace();
+	}
+
 	/*
 	 * We don't want anybody waiting for log reservations after this. That
 	 * means we have to wake up everybody queued up on reserveq as well as
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index fed5fa879e0d..4e55138eed8a 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -540,7 +540,7 @@ xlog_cil_insert_items(
 		spin_unlock(&cil->xc_cil_lock);

 	if (tp->t_ticket->t_curr_res < 0)
-		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
 }

 static void
@@ -864,7 +864,7 @@ xlog_cil_write_commit_record(

 	error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
 	if (error)
-		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
 	return error;
 }

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 96490b99b89a..7d2a788b3e7e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2459,7 +2459,7 @@ xlog_finish_defer_ops(
 		error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
 				dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
 		if (error) {
-			xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+			xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
 			return error;
 		}

@@ -3427,7 +3427,7 @@ xlog_recover_finish(
 		 */
 		xlog_recover_cancel_intents(log);
 		xfs_alert(log->l_mp, "Failed to recover intents");
-		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
 		return error;
 	}

@@ -3459,7 +3459,7 @@ xlog_recover_finish(
 		 * end of intents processing can be pushed through the CIL
 		 * and AIL.
 		 */
-		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
 	}

 	return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 45f6d9a7e8ba..69888dd863d7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -21,6 +21,7 @@
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_log.h"
+#include "xfs_log_priv.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c2ccb98c7bcd..d3a97a028560 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -873,17 +873,17 @@ xfs_trans_ail_delete(
 	int			shutdown_type)
 {
 	struct xfs_ail		*ailp = lip->li_ailp;
-	struct xfs_mount	*mp = ailp->ail_log->l_mp;
+	struct xlog		*log = ailp->ail_log;
 	xfs_lsn_t		tail_lsn;

 	spin_lock(&ailp->ail_lock);
 	if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
 		spin_unlock(&ailp->ail_lock);
-		if (shutdown_type && !xlog_is_shutdown(ailp->ail_log)) {
-			xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+		if (shutdown_type && !xlog_is_shutdown(log)) {
+			xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE,
 				"%s: attempting to delete a log item that is not in the AIL",
 					__func__);
-			xfs_force_shutdown(mp, shutdown_type);
+			xlog_force_shutdown(log, shutdown_type);
 		}
 		return;
 	}
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.17-rc6 commit 41e6362183589afd2cd51d653e277d256daab11f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When we call xfs_forced_shutdown(), the caller often expects the filesystem to be completely shut down when it returns. However, if we have racing xfs_forced_shutdown() calls, the first caller sets the mount shutdown flag then goes to shutdown the log. The second caller sees the mount shutdown flag and returns immediately - it does not wait for the log to be shut down.
Unfortunately, xfs_forced_shutdown() is used in some places that expect it to completely shut down the filesystem before it returns (e.g. xfs_trans_log_inode()). As such, returning before the log has been shut down leaves us in a place where the transaction failed to complete correctly but we still call xfs_trans_commit(). This situation arises because xfs_trans_log_inode() does not return an error and instead calls xfs_force_shutdown() to ensure that the transaction being committed is aborted.
Unfortunately, we have a race condition where xfs_trans_commit() needs to check xlog_is_shutdown() because it can't abort log items before the log is shut down, but it needs to use xfs_is_shutdown() because xfs_forced_shutdown() does not block waiting for the log to shut down.
To fix this conundrum, first we make all calls to xfs_forced_shutdown() block until the log is also shut down. This means we can then safely use xfs_forced_shutdown() as a mechanism that ensures the currently running transaction will be aborted by xfs_trans_commit() regardless of the shutdown check it uses.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_fsops.c | 6 +++++- fs/xfs/xfs_log.c | 1 + fs/xfs/xfs_log_priv.h | 11 +++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 1d934923e52a..1430b2b9c466 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,6 +17,7 @@
 #include "xfs_fsops.h"
 #include "xfs_trans_space.h"
 #include "xfs_log.h"
+#include "xfs_log_priv.h"
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"

@@ -465,8 +466,11 @@ xfs_do_force_shutdown(
 	int		tag;
 	const char	*why;

-	if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate))
+
+	if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
+		xlog_shutdown_wait(mp->m_log);
 		return;
+	}
 	if (mp->m_sb_bp)
 		mp->m_sb_bp->b_flags |= XBF_DONE;

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 13e49b422286..544539ab3c94 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3831,6 +3831,7 @@ xlog_force_shutdown(
 	xlog_state_shutdown_callbacks(log);
 	spin_unlock(&log->l_icloglock);

+	wake_up_var(&log->l_opstate);
 	return log_error;
 }

diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index eee173bdc8f4..ecb9ec8a4d05 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -478,6 +478,17 @@ xlog_is_shutdown(struct xlog *log)
 	return test_bit(XLOG_IO_ERROR, &log->l_opstate);
 }

+/*
+ * Wait until the xlog_force_shutdown() has marked the log as shut down
+ * so xlog_is_shutdown() will always return true.
+ */
+static inline void
+xlog_shutdown_wait(
+	struct xlog	*log)
+{
+	wait_var_event(&log->l_opstate, xlog_is_shutdown(log));
+}
+
 /* common routines */
 extern int
 xlog_recover(
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.17-rc6 commit 3c4cb76bce4380aee99c275b3920049350939e47 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If a shutdown races with xfs_trans_commit() and we have shut down the filesystem but not the log, we will still cancel the transaction. This can result in aborting dirty log items instead of committing and pinning them whilst the log is still running. Hence we can end up with dirty, unlogged metadata in memory that isn't in the AIL and can be flushed to disk via writeback clustering.
This was discovered from a g/388 trace where an inode log item was having IO completed on it and it wasn't in the AIL, hence tripping asserts in xfs_ail_check(). Inode cluster writeback started long after the filesystem shutdown started, and long after the transaction containing the dirty inode was aborted and the log item marked XFS_LI_ABORTED. The inode was seen as dirty and unpinned, so it was flushed. IO completion tried to remove the inode from the AIL, at which point stuff went bad:
XFS (pmem1): Log I/O Error (0x6) detected at xfs_fs_goingdown+0xa3/0xf0 (fs/xfs/xfs_fsops.c:500).  Shutting down filesystem.
XFS: Assertion failed: in_ail, file: fs/xfs/xfs_trans_ail.c, line: 67
XFS (pmem1): Please unmount the filesystem and rectify the problem(s)
Workqueue: xfs-buf/pmem1 xfs_buf_ioend_work
RIP: 0010:assfail+0x27/0x2d
Call Trace:
 <TASK>
 xfs_ail_check+0xa8/0x180
 xfs_ail_delete_one+0x3b/0xf0
 xfs_buf_inode_iodone+0x329/0x3f0
 xfs_buf_ioend+0x1f8/0x530
 xfs_buf_ioend_work+0x15/0x20
 process_one_work+0x1ac/0x390
 worker_thread+0x56/0x3c0
 kthread+0xf6/0x120
 ret_from_fork+0x1f/0x30
 </TASK>
xfs_trans_commit() needs to check log state for shutdown, not mount state. It cannot abort dirty log items while the log is still running as dirty items must remained pinned in memory until they are either committed to the journal or the log has shut down and they can be safely tossed away. Hence if the log has not shut down, the xfs_trans_commit() path must allow completed transactions to commit to the CIL and pin the dirty items even if a mount shutdown has started.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_trans.c | 48 +++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 23cddfec03ef..b29f60cf1909 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -837,6 +837,7 @@ __xfs_trans_commit(
 	bool			regrant)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
+	struct xlog		*log = mp->m_log;
 	xfs_csn_t		commit_seq = 0;
 	int			error = 0;
 	int			sync = tp->t_flags & XFS_TRANS_SYNC;
@@ -865,7 +866,13 @@ __xfs_trans_commit(
 	if (!(tp->t_flags & XFS_TRANS_DIRTY))
 		goto out_unreserve;

-	if (xfs_is_shutdown(mp)) {
+	/*
+	 * We must check against log shutdown here because we cannot abort log
+	 * items and leave them dirty, inconsistent and unpinned in memory while
+	 * the log is active. This leaves them open to being written back to
+	 * disk, and that will lead to on-disk corruption.
+	 */
+	if (xlog_is_shutdown(log)) {
 		error = -EIO;
 		goto out_unreserve;
 	}
@@ -879,7 +886,7 @@ __xfs_trans_commit(
 		xfs_trans_apply_sb_deltas(tp);
 	xfs_trans_apply_dquot_deltas(tp);

-	xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant);
+	xlog_cil_commit(log, tp, &commit_seq, regrant);

 	xfs_trans_free(tp);

@@ -906,10 +913,10 @@ __xfs_trans_commit(
 	 */
 	xfs_trans_unreserve_and_mod_dquots(tp);
 	if (tp->t_ticket) {
-		if (regrant && !xlog_is_shutdown(mp->m_log))
-			xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
+		if (regrant && !xlog_is_shutdown(log))
+			xfs_log_ticket_regrant(log, tp->t_ticket);
 		else
-			xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+			xfs_log_ticket_ungrant(log, tp->t_ticket);
 		tp->t_ticket = NULL;
 	}
 	xfs_trans_free_items(tp, !!error);
@@ -927,18 +934,27 @@ xfs_trans_commit(
 }

 /*
- * Unlock all of the transaction's items and free the transaction.
- * The transaction must not have modified any of its items, because
- * there is no way to restore them to their previous state.
+ * Unlock all of the transaction's items and free the transaction. If the
+ * transaction is dirty, we must shut down the filesystem because there is no
+ * way to restore them to their previous state.
  *
- * If the transaction has made a log reservation, make sure to release
- * it as well.
+ * If the transaction has made a log reservation, make sure to release it as
+ * well.
+ *
+ * This is a high level function (equivalent to xfs_trans_commit()) and so can
+ * be called after the transaction has effectively been aborted due to the mount
+ * being shut down. However, if the mount has not been shut down and the
+ * transaction is dirty we will shut the mount down and, in doing so, that
+ * guarantees that the log is shut down, too. Hence we don't need to be as
+ * careful with shutdown state and dirty items here as we need to be in
+ * xfs_trans_commit().
  */
 void
 xfs_trans_cancel(
 	struct xfs_trans	*tp)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
+	struct xlog		*log = mp->m_log;
 	bool			dirty = (tp->t_flags & XFS_TRANS_DIRTY);

 	trace_xfs_trans_cancel(tp, _RET_IP_);
@@ -947,16 +963,18 @@ xfs_trans_cancel(
 		xfs_defer_cancel(tp);

 	/*
-	 * See if the caller is relying on us to shut down the
-	 * filesystem.  This happens in paths where we detect
-	 * corruption and decide to give up.
+	 * See if the caller is relying on us to shut down the filesystem. We
+	 * only want an error report if there isn't already a shutdown in
+	 * progress, so we only need to check against the mount shutdown state
+	 * here.
 	 */
 	if (dirty && !xfs_is_shutdown(mp)) {
 		XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	}
 #ifdef DEBUG
-	if (!dirty && !xfs_is_shutdown(mp)) {
+	/* Log items need to be consistent until the log is shut down. */
+	if (!dirty && !xlog_is_shutdown(log)) {
 		struct xfs_log_item *lip;

 		list_for_each_entry(lip, &tp->t_items, li_trans)
@@ -967,7 +985,7 @@ xfs_trans_cancel(
 	xfs_trans_unreserve_and_mod_dquots(tp);

 	if (tp->t_ticket) {
-		xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+		xfs_log_ticket_ungrant(log, tp->t_ticket);
 		tp->t_ticket = NULL;
 	}
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.17-rc6 commit 5652ef31705f240e1528fe5a45d99229752e1ec8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When a checkpoint writeback is run by log recovery, corruption propagated from the log can result in writeback verifiers failing and calling xfs_force_shutdown() from xfs_buf_delwri_submit_buffers().
This results in the mount being marked as shutdown, but the log does not get marked as shut down because:
	/*
	 * If this happens during log recovery then we aren't using the runtime
	 * log mechanisms yet so there's nothing to shut down.
	 */
	if (!log || xlog_in_recovery(log))
		return false;
If there are other buffers that then fail (say due to detecting the mount shutdown), they will now hang in xfs_do_force_shutdown() waiting for the log to shut down like this:
  __schedule+0x30d/0x9e0
  schedule+0x55/0xd0
  xfs_do_force_shutdown+0x1cd/0x200
  ? init_wait_var_entry+0x50/0x50
  xfs_buf_ioend+0x47e/0x530
  __xfs_buf_submit+0xb0/0x240
  xfs_buf_delwri_submit_buffers+0xfe/0x270
  xfs_buf_delwri_submit+0x3a/0xc0
  xlog_do_recovery_pass+0x474/0x7b0
  ? do_raw_spin_unlock+0x30/0xb0
  xlog_do_log_recovery+0x91/0x140
  xlog_do_recover+0x38/0x1e0
  xlog_recover+0xdd/0x170
  xfs_log_mount+0x17e/0x2e0
  xfs_mountfs+0x457/0x930
  xfs_fs_fill_super+0x476/0x830
xlog_force_shutdown() always needs to mark the log as shut down, regardless of whether recovery is in progress or not, so that multiple calls to xfs_force_shutdown() during recovery don't end up waiting for the log to be shut down like this.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_log.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 544539ab3c94..506bb43b1eda 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3755,11 +3755,7 @@ xlog_force_shutdown(
 {
 	bool		log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);

-	/*
-	 * If this happens during log recovery then we aren't using the runtime
-	 * log mechanisms yet so there's nothing to shut down.
-	 */
-	if (!log || xlog_in_recovery(log))
+	if (!log)
 		return false;

 	/*
@@ -3768,10 +3764,16 @@ xlog_force_shutdown(
 	 * before the force will prevent the log force from flushing the iclogs
 	 * to disk.
 	 *
-	 * Re-entry due to a log IO error shutdown during the log force is
-	 * prevented by the atomicity of higher level shutdown code.
+	 * When we are in recovery, there are no transactions to flush, and
+	 * we don't want to touch the log because we don't want to perturb the
+	 * current head/tail for future recovery attempts. Hence we need to
+	 * avoid a log force in this case.
+	 *
+	 * If we are shutting down due to a log IO error, then we must avoid
+	 * trying to write the log as that may just result in more IO errors and
+	 * an endless shutdown/force loop.
 	 */
-	if (!log_error)
+	if (!log_error && !xlog_in_recovery(log))
 		xfs_log_force(log->l_mp, XFS_LOG_SYNC);

 	/*
From: Dave Chinner dchinner@redhat.com
stable inclusion from stable-v5.10.167 commit 35f049abbae3954fe2c3d57d62fe6624d9b29df6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit 472c6e46f589c26057596dcba160712a5b3e02c5 upstream.
[partial backport for dependency - xfs_ioc_space() still uses XFS_PREALLOC_SYNC]
Callers can achieve the same thing by calling xfs_log_force_inode() after making their modifications. There is no need for xfs_update_prealloc_flags() to do this.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Amir Goldstein amir73il@gmail.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Wang Hai wanghai38@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_file.c | 13 +++++++------ fs/xfs/xfs_pnfs.c | 6 ++++-- 2 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 62057d85895d..23ea3ebff48a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -95,8 +95,6 @@ xfs_update_prealloc_flags(
 		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	if (flags & XFS_PREALLOC_SYNC)
-		xfs_trans_set_sync(tp);
 	return xfs_trans_commit(tp);
 }

@@ -1044,9 +1042,6 @@ xfs_file_fallocate(
 		}
 	}

-	if (file->f_flags & O_DSYNC)
-		flags |= XFS_PREALLOC_SYNC;
-
 	error = xfs_update_prealloc_flags(ip, flags);
 	if (error)
 		goto out_unlock;
@@ -1068,8 +1063,14 @@ xfs_file_fallocate(
 	 * leave shifted extents past EOF and hence losing access to
 	 * the data that is contained within them.
 	 */
-	if (do_file_insert)
+	if (do_file_insert) {
 		error = xfs_insert_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (file->f_flags & O_DSYNC)
+		error = xfs_log_force_inode(ip);

 out_unlock:
 	xfs_iunlock(ip, iolock);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index cae7d1fd7988..4ca25193925a 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -164,10 +164,12 @@ xfs_fs_map_blocks(
 		 * that the blocks allocated and handed out to the client are
 		 * guaranteed to be present even after a server crash.
 		 */
-		error = xfs_update_prealloc_flags(ip,
-				XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
+		error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET);
+		if (!error)
+			error = xfs_log_force_inode(ip);
 		if (error)
 			goto out_unlock;
+
 	} else {
 		xfs_iunlock(ip, lock_flags);
 	}
From: Dave Chinner dchinner@redhat.com
stable inclusion from stable-v5.10.167 commit 308dfe49eb753d7263fa4e001251d733c710ed14 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit fbe7e520036583a783b13ff9744e35c2a329d9a4 upstream.
In XFS, we always update the inode change and modification time when any fallocate() operation succeeds. Furthermore, as various fallocate modes can change the file contents (extending EOF, punching holes, zeroing things, shifting extents), we should drop file privileges like suid just like we do for a regular write(). There's already a VFS helper that figures all this out for us, so use that.
The net effect of this is that we no longer drop suid/sgid if the caller is root, but we also now drop file capabilities.
We also move the xfs_update_prealloc_flags() function so that it is now only called by the scope that needs to set the prealloc flag.
Based on a patch from Darrick Wong.
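A quick userspace check of the privilege-dropping behaviour described above (a sketch only; run it as an unprivileged user that owns the file, since a privileged caller keeps suid/sgid):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";
	struct stat st;
	int fd = open(path, O_RDWR | O_CREAT, 0755);

	if (fd < 0 || fchmod(fd, 04755) ||		/* set the suid bit */
	    fallocate(fd, 0, 0, 1 << 20) ||		/* plain preallocation */
	    fstat(fd, &st)) {
		perror(path);
		return 1;
	}
	printf("mode after fallocate: %o (suid %s)\n", st.st_mode & 07777,
	       (st.st_mode & S_ISUID) ? "kept" : "dropped");
	return 0;
}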
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Amir Goldstein amir73il@gmail.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Wang Hai wanghai38@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_file.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 23ea3ebff48a..2544aa3089d5 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -939,6 +939,10 @@ xfs_file_fallocate(
 		goto out_unlock;
 	}

+	error = file_modified(file);
+	if (error)
+		goto out_unlock;
+
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
@@ -1040,11 +1044,12 @@ xfs_file_fallocate(
 			if (error)
 				goto out_unlock;
 		}
-	}

-	error = xfs_update_prealloc_flags(ip, flags);
-	if (error)
-		goto out_unlock;
+		error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET);
+		if (error)
+			goto out_unlock;
+
+	}

 	/* Change file size if needed */
 	if (new_size) {
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.14-rc4 commit 8cf07f3dd56195316be97758cb8b4e1d7183ea84 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The verifier checks explicitly for bp->b_bn == XFS_SB_DADDR to match the primary superblock buffer, but the primary superblock is an uncached buffer and so bp->b_bn is always -1ULL. Hence this never matches and the CRC error reporting is wholly dependent on the mount superblock already being populated so CRC feature checks pass and allow CRC errors to be reported.
Fix this so that the primary superblock CRC error reporting is not dependent on already having read the superblock into memory.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com
Conflicts: fs/xfs/libxfs/xfs_sb.c Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_sb.c | 2 +- fs/xfs/xfs_buf.h | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 8a8e5050352d..d7ac96b137bd 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -809,7 +809,7 @@ xfs_sb_read_verify(
if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { /* Only fail bad secondaries on a known V5 filesystem */ - if (bp->b_bn == XFS_SB_DADDR || + if (bp->b_maps[0].bm_bn == XFS_SB_DADDR || xfs_has_crc(mp)) { error = -EFSBADCRC; goto out_error; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 200793275c36..23620d8312ee 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -133,7 +133,12 @@ typedef struct xfs_buf { * fast-path on locking. */ struct rhash_head b_rhash_head; /* pag buffer hash node */ - xfs_daddr_t b_bn; /* block number of buffer */ + + /* + * b_bn is the cache index. Do not use directly, use b_maps[0].bm_bn + * for the buffer disk address instead. + */ + xfs_daddr_t b_bn; int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.15-rc4 commit 78e8ec83a404d63dcc86b251f42e4ee8aff27465 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The btree geometry computation function has an off-by-one error in that it does not allow maximally tall btrees (nlevels == XFS_BTREE_MAXLEVELS). This can result in repairs failing unnecessarily on very fragmented filesystems. Subsequent patches to remove MAXLEVELS usage in favor of the per-btree type computations will make this a much more likely occurrence.
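As a standalone illustration (not the XFS function itself) of the corrected loop bound, the sketch below computes the height of a btree for a given record count and fanout: the loop must be allowed to reach the maximum height, and only a tree that would need more than that may fail. The MAXLEVELS cap and the fanout values are made-up numbers for demonstration.
```
#include <stdio.h>
#include <stdint.h>

#define MAXLEVELS 9	/* hypothetical cap, not the real XFS_BTREE_MAXLEVELS */

static int compute_height(uint64_t nr_records, unsigned int fanout)
{
	uint64_t nr_this_level = nr_records;
	unsigned int nlevels;

	/* '<=' lets nlevels legitimately reach MAXLEVELS ... */
	for (nlevels = 1; nlevels <= MAXLEVELS; nlevels++) {
		if (nr_this_level <= fanout)
			return nlevels;	/* this level fits in a single root block */
		/* otherwise another level of nodes is needed above this one */
		nr_this_level = (nr_this_level + fanout - 1) / fanout;
	}

	/* ... and only a tree that would need *more* than MAXLEVELS fails */
	return -1;
}

int main(void)
{
	/* 4^9 = 262144 records need exactly MAXLEVELS levels: must succeed */
	printf("height = %d\n", compute_height(262144, 4));
	/* one more record would need a 10th level: must fail */
	printf("height = %d\n", compute_height(262145, 4));
	return 0;
}
```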
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Chandan Babu R chandan.babu@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Guo Xuenan guoxuenan@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_btree_staging.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index f464a7c7cf22..4873b8a065d4 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -663,7 +663,7 @@ xfs_btree_bload_compute_geometry( xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
bbl->nr_records = nr_this_level = nr_records; - for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) { + for (cur->bc_nlevels = 1; cur->bc_nlevels <= XFS_BTREE_MAXLEVELS;) { uint64_t level_blocks; uint64_t dontcare64; unsigned int level = cur->bc_nlevels - 1; @@ -725,7 +725,7 @@ xfs_btree_bload_compute_geometry( nr_this_level = level_blocks; }
- if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS) + if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.16-rc5 commit 47a6df7cd3174b91c6c862eae0b8d4e13591df52 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
While debugging some very strange rmap corruption reports in connection with the online directory repair code, I root-caused the error to the following incorrect sequence:
<start repair transaction> <expand directory, causing a deferred rmap to be queued> <roll transaction> <cancel transaction>
Obviously, we should have committed the transaction instead of cancelling it. Thinking more broadly, however, xfs_trans_cancel should have warned us that we were throwing away a work item that we had already committed to performing. This is not correct, so we need to shut down the filesystem.
Change xfs_trans_cancel to complain in the loudest manner if we're cancelling any transaction with deferred work items attached.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_trans.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index b29f60cf1909..8b6617833c58 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -959,8 +959,17 @@ xfs_trans_cancel(
trace_xfs_trans_cancel(tp, _RET_IP_);
- if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) + /* + * It's never valid to cancel a transaction with deferred ops attached, + * because the transaction is effectively dirty. Complain about this + * loudly before freeing the in-memory defer items. + */ + if (!list_empty(&tp->t_dfops)) { + ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops)); + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + dirty = true; xfs_defer_cancel(tp); + }
/* * See if the caller is relying on us to shut down the filesystem. We
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.16-rc5 commit 7b7820b83f230036fc48c3e7fb280c48c58adebf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Ian Kent reported that for inline symlinks, it's possible for vfs_readlink to hang on to the target buffer returned by xfs_vn_get_link_inline long after it's been freed by xfs inode reclaim. This is a layering violation -- we should never expose XFS internals to the VFS.
When the symlink has a remote target, we allocate a separate buffer, copy the internal information, and let the VFS manage the new buffer's lifetime. Let's adapt the inline code paths to do this too. It's less efficient, but fixes the layering violation and avoids the need to adapt the if_data lifetime to rcu rules. Clearly I don't care about readlink benchmarks.
As a side note, this fixes the minor locking violation where we can access the inode data fork without taking any locks; proper locking (and eliminating the possibility of having to switch inode_operations on a live inode) is essential to online repair coordinating repairs correctly.
Reported-by: Ian Kent raven@themaw.net Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com
Conflicts: fs/xfs/xfs_symlink.c Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_iops.c | 34 +--------------------------------- fs/xfs/xfs_symlink.c | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 42 deletions(-)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a3b63ebc246b..cc478df14996 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -471,27 +471,6 @@ xfs_vn_get_link( return ERR_PTR(error); }
-STATIC const char * -xfs_vn_get_link_inline( - struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - struct xfs_inode *ip = XFS_I(inode); - char *link; - - ASSERT(ip->i_df.if_flags & XFS_IFINLINE); - - /* - * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if - * if_data is junk. - */ - link = ip->i_df.if_u1.if_data; - if (XFS_IS_CORRUPT(ip->i_mount, !link)) - return ERR_PTR(-EFSCORRUPTED); - return link; -} - static uint32_t xfs_stat_blksize( struct xfs_inode *ip) @@ -1211,14 +1190,6 @@ static const struct inode_operations xfs_symlink_inode_operations = { .update_time = xfs_vn_update_time, };
-static const struct inode_operations xfs_inline_symlink_inode_operations = { - .get_link = xfs_vn_get_link_inline, - .getattr = xfs_vn_getattr, - .setattr = xfs_vn_setattr, - .listxattr = xfs_vn_listxattr, - .update_time = xfs_vn_update_time, -}; - /* Figure out if this file actually supports DAX. */ static bool xfs_inode_supports_dax( @@ -1369,10 +1340,7 @@ xfs_setup_iops( inode->i_fop = &xfs_dir_file_operations; break; case S_IFLNK: - if (ip->i_df.if_flags & XFS_IFINLINE) - inode->i_op = &xfs_inline_symlink_inode_operations; - else - inode->i_op = &xfs_symlink_inode_operations; + inode->i_op = &xfs_symlink_inode_operations; break; default: inode->i_op = &xfs_inode_operations; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5900f35482aa..2d944596def5 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -21,6 +21,7 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_trans.h" +#include "xfs_error.h"
/* ----- Kernel only functions below ----- */ int @@ -95,17 +96,15 @@ xfs_readlink_bmap_ilocked(
int xfs_readlink( - struct xfs_inode *ip, - char *link) + struct xfs_inode *ip, + char *link) { - struct xfs_mount *mp = ip->i_mount; - xfs_fsize_t pathlen; - int error = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = -EFSCORRUPTED;
trace_xfs_readlink(ip);
- ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE)); - if (xfs_is_shutdown(mp)) return -EIO;
@@ -120,12 +119,22 @@ xfs_readlink( __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - error = -EFSCORRUPTED; goto out; }
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + /* + * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED + * if if_data is junk. + */ + if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data)) + goto out;
- error = xfs_readlink_bmap_ilocked(ip, link); + memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1); + error = 0; + } else { + error = xfs_readlink_bmap_ilocked(ip, link); + }
out: xfs_iunlock(ip, XFS_ILOCK_SHARED);
From: Guo Xuenan guoxuenan@huawei.com
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK
--------------------------------
This reverts commit 1768d0fc0015c38aaa6c0e609405a17d49a3f2a8.
Signed-off-by: Guo Xuenan guoxuenan@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_dir2_leaf.c | 12 ------------ 1 file changed, 12 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 69f37dcbdc12..4abf24c3aa94 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -815,18 +815,6 @@ xfs_dir2_leaf_addname( */ else xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); - /* - * An abnormal corner case, bestfree count less than data - * blocks, add a condition to avoid UAF or slab-out-of bound. - */ - if ((char *)(&bestsp[use_block]) >= (char *)ltp) { - xfs_trans_brelse(tp, lbp); - if (tp->t_flags & XFS_TRANS_DIRTY) - xfs_force_shutdown(tp->t_mountp, - SHUTDOWN_CORRUPT_INCORE); - return -EFSCORRUPTED; - } - hdr = dbp->b_addr; bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); bestsp[use_block] = bf[0].length;
From: Guo Xuenan guoxuenan@huawei.com
mainline inclusion from mainline-v6.1-rc1 commit 13cf24e00665c9751951a422756d975812b71173 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
For a leaf dir, in most cases there should be as many bestfree slots as there are dir data blocks that can fit under i_size (except for [1]).

The root cause is that we don't examine the number of bestfree slots: when the number of slots is less than the number of dir data blocks, and we need to allocate a new dir data block and update the bestfree array, we use the dir block number as an index into the bestfree array without checking the leaf buffer boundary, which may cause a UAF or other memory access problems. This issue can also be triggered with test case xfs/473 from fstests.

Following Dave Chinner's and Darrick's suggestion, add a buffer verifier to detect this abnormal situation in time. Also simplify the testcase for fstests xfs/554 [1].
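As a standalone illustration of the check the new verifier performs (not the kernel code itself), the sketch below derives a data block number from each leaf entry address and rejects the buffer once any entry refers to a block at or beyond bestcount. The dataptr encoding is a made-up shift, not the real xfs_dir2_dataptr_to_db() geometry math.
```
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* simplified: the high bits of a dataptr encode the data block number */
#define DATAPTR_DB_SHIFT 11	/* made-up shift for illustration */

static bool leaf_entries_fit_bestcount(const uint32_t *addrs, int count,
				       uint32_t bestcount)
{
	for (int i = 0; i < count; i++) {
		uint32_t db = addrs[i] >> DATAPTR_DB_SHIFT;

		/* an entry pointing past the bestfree array means corruption */
		if (db >= bestcount)
			return false;
	}
	return true;
}

int main(void)
{
	uint32_t addrs[] = { 0x0800, 0x1800, 0x5800 };	/* db = 1, 3, 11 */

	printf("%d\n", leaf_entries_fit_bestcount(addrs, 3, 12));	/* 1: ok */
	printf("%d\n", leaf_entries_fit_bestcount(addrs, 3, 4));	/* 0: reject */
	return 0;
}
```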
The error log is shown as follows: ================================================================== BUG: KASAN: use-after-free in xfs_dir2_leaf_addname+0x1995/0x1ac0 Write of size 2 at addr ffff88810168b000 by task touch/1552 CPU: 5 PID: 1552 Comm: touch Not tainted 6.0.0-rc3+ #101 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: <TASK> dump_stack_lvl+0x4d/0x66 print_report.cold+0xf6/0x691 kasan_report+0xa8/0x120 xfs_dir2_leaf_addname+0x1995/0x1ac0 xfs_dir_createname+0x58c/0x7f0 xfs_create+0x7af/0x1010 xfs_generic_create+0x270/0x5e0 path_openat+0x270b/0x3450 do_filp_open+0x1cf/0x2b0 do_sys_openat2+0x46b/0x7a0 do_sys_open+0xb7/0x130 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fe4d9e9312b Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 RSP: 002b:00007ffda4c16c20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe4d9e9312b RDX: 0000000000000941 RSI: 00007ffda4c17f33 RDI: 00000000ffffff9c RBP: 00007ffda4c17f33 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000941 R13: 00007fe4d9f631a4 R14: 00007ffda4c17f33 R15: 0000000000000000 </TASK>
The buggy address belongs to the physical page: page:ffffea000405a2c0 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10168b flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) raw: 002fffff80000000 ffffea0004057788 ffffea000402dbc8 0000000000000000 raw: 0000000000000000 0000000000170000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff88810168af00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff88810168af80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
ffff88810168b000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
^ ffff88810168b080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88810168b100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Disabling lock debugging due to kernel taint 00000000: 58 44 44 33 5b 53 35 c2 00 00 00 00 00 00 00 78 XDD3[S5........x XFS (sdb): Internal error xfs_dir2_data_use_free at line 1200 of file fs/xfs/libxfs/xfs_dir2_data.c. Caller xfs_dir2_data_use_free+0x28a/0xeb0 CPU: 5 PID: 1552 Comm: touch Tainted: G B 6.0.0-rc3+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: <TASK> dump_stack_lvl+0x4d/0x66 xfs_corruption_error+0x132/0x150 xfs_dir2_data_use_free+0x198/0xeb0 xfs_dir2_leaf_addname+0xa59/0x1ac0 xfs_dir_createname+0x58c/0x7f0 xfs_create+0x7af/0x1010 xfs_generic_create+0x270/0x5e0 path_openat+0x270b/0x3450 do_filp_open+0x1cf/0x2b0 do_sys_openat2+0x46b/0x7a0 do_sys_open+0xb7/0x130 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fe4d9e9312b Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 RSP: 002b:00007ffda4c16c20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe4d9e9312b RDX: 0000000000000941 RSI: 00007ffda4c17f46 RDI: 00000000ffffff9c RBP: 00007ffda4c17f46 R08: 0000000000000000 R09: 0000000000000001 R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000941 R13: 00007fe4d9f631a4 R14: 00007ffda4c17f46 R15: 0000000000000000 </TASK> XFS (sdb): Corruption detected. Unmount and run xfs_repair
[1] https://lore.kernel.org/all/20220928095355.2074025-1-guoxuenan@huawei.com/ Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Guo Xuenan guoxuenan@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_dir2_leaf.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 4abf24c3aa94..bb18323fd1ed 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -145,6 +145,8 @@ xfs_dir3_leaf_check_int( xfs_dir2_leaf_tail_t *ltp; int stale; int i; + bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
@@ -157,8 +159,7 @@ xfs_dir3_leaf_check_int( return __this_address;
/* Leaves and bests don't overlap in leaf format. */ - if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || - hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + if (isleaf1 && (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address;
@@ -171,6 +172,10 @@ xfs_dir3_leaf_check_int( } if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; + if (isleaf1 && xfs_dir2_dataptr_to_db(geo, + be32_to_cpu(hdr->ents[i].address)) >= + be32_to_cpu(ltp->bestcount)) + return __this_address; } if (hdr->stale != stale) return __this_address;
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v6.1-rc4 commit 118e021b4b66f758f8e8f21dc0e5e0a4c721e69e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When we reserve a delalloc region in xfs_buffered_write_iomap_begin, we mark the iomap as IOMAP_F_NEW so that the write context understands that it allocated the delalloc region.
If we then fail that buffered write, xfs_buffered_write_iomap_end() checks for the IOMAP_F_NEW flag and if it is set, it punches out the unused delalloc region that was allocated for the write.
The assumption this code makes is that all buffered write operations that can allocate space are run under an exclusive lock (i_rwsem). This is an invalid assumption: page faults in mmap()d regions call through this same function pair to map the file range being faulted and this runs only holding the inode->i_mapping->invalidate_lock in shared mode.
IOWs, we can have races between page faults and write() calls that fail the nested page cache write operation that result in data loss. That is, the failing iomap_end call will punch out the data that the other racing iomap iteration brought into the page cache. This can be reproduced with generic/34[46] if we arbitrarily fail page cache copy-in operations from write() syscalls.
Code analysis tells us that the iomap_page_mkwrite() function holds the already instantiated and uptodate folio locked across the iomap mapping iterations. Hence the folio cannot be removed from memory whilst we are mapping the range it covers, and as such we do not care if the mapping changes state underneath the iomap iteration loop:
1. If the folio is not already dirty, there are no writeback races possible.
2. If we allocated the mapping (delalloc or unwritten), the folio cannot already be dirty. See #1.
3. If the folio is already dirty, it must be up to date. As we hold it locked, it cannot be reclaimed from memory. Hence we always have valid data in the page cache while iterating the mapping.
4. Valid data in the page cache can exist when the underlying mapping is DELALLOC, UNWRITTEN or WRITTEN. Having the mapping change from DELALLOC->UNWRITTEN or UNWRITTEN->WRITTEN does not change the data in the page - it only affects actions if we are initialising a new page. Hence #3 applies and we don't care about these extent map transitions racing with iomap_page_mkwrite().
5. iomap_page_mkwrite() checks for page invalidation races (truncate, hole punch, etc) after it locks the folio. We also hold the mapping->invalidation_lock here, and hence the mapping cannot change due to extent removal operations while we are iterating the folio.
As such, filesystems that don't use bufferheads will never fail the iomap_folio_mkwrite_iter() operation on the current mapping, regardless of whether the iomap should be considered stale.
Further, the range we are asked to iterate is limited to the range inside EOF that the folio spans. Hence, for XFS, we will only map the exact range we are asked for, and we will only do speculative preallocation with delalloc if we are mapping a hole at the EOF page. The iterator will consume the entire range of the folio that is within EOF, and anything beyond the EOF block cannot be accessed. We never need to truncate this post-EOF speculative prealloc away in the context of the iomap_page_mkwrite() iterator because if it remains unused we'll remove it when the last reference to the inode goes away.
Hence we don't actually need an .iomap_end() cleanup/error handling path at all for iomap_page_mkwrite() for XFS. This means we can separate the page fault processing from the complexity of the .iomap_end() processing in the buffered write path. This also means that the buffered write path will also be able to take the mapping->invalidate_lock as necessary.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_iomap.c | 9 +++++++++ fs/xfs/xfs_iomap.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2544aa3089d5..a6a59bd6c189 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1327,7 +1327,7 @@ __xfs_filemap_fault( } else { if (write_fault) ret = iomap_page_mkwrite(vmf, - &xfs_buffered_write_iomap_ops); + &xfs_page_mkwrite_iomap_ops); else ret = filemap_fault(vmf); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f5fdb0438fcb..b13aa16adf58 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1131,6 +1131,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, };
+/* + * iomap_page_mkwrite() will never fail in a way that requires delalloc extents + * that it allocated to be revoked. Hence we do not need an .iomap_end method + * for this operation. + */ +const struct iomap_ops xfs_page_mkwrite_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, +}; + static int xfs_read_iomap_begin( struct inode *inode, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 7d3703556d0e..3626c9894bdb 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -41,6 +41,7 @@ xfs_aligned_fsb_count( }
extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops;
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v6.1-rc4 commit d7b64041164ca177170191d2ad775da074ab2926 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
A recent multithreaded write data corruption has been uncovered in the iomap write code. The core of the problem is partial folio writes can be flushed to disk while a new racing write can map it and fill the rest of the page:
writeback                       new write

allocate blocks
  blocks are unwritten
submit IO
.....
                                map blocks
                                iomap indicates UNWRITTEN range
                                loop {
                                  lock folio
                                  copyin data
                                  .....
IO completes
  runs unwritten extent conv
    blocks are marked written
                                  <iomap now stale>
                                  get next folio
                                }
Now add memory pressure such that memory reclaim evicts the partially written folio that has already been written to disk.
When the new write finally gets to the last partial page of the new write, it does not find it in cache, so it instantiates a new page, sees the iomap is unwritten, and zeros the part of the page that it does not have data from. This overwrites the data on disk that was originally written.
The full description of the corruption mechanism can be found here:
https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.ar...
To solve this problem, we need to check whether the iomap is still valid after we lock each folio during the write. We have to do it after we lock the page so that we don't end up with state changes occurring while we wait for the folio to be locked.
Hence we need a mechanism to be able to check that the cached iomap is still valid (similar to what we already do in buffered writeback), and we need a way for ->begin_write to back out and tell the high level iomap iterator that we need to remap the remaining write range.
The iomap needs to grow some storage for the validity cookie that the filesystem provides to travel with the iomap. XFS, in particular, also needs to know some more information about what the iomap maps (attribute extents rather than file data extents) to for the validity cookie to cover all the types of iomaps we might need to validate.
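A minimal userspace model of this mechanism (illustrative names only, not the kernel API) is sketched below: the filesystem samples a per-fork sequence number into the validity cookie when it hands out a mapping, any later change to the extent map bumps that sequence, and a later comparison detects that the cached mapping has gone stale and must be refetched.
```
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fork_state {
	uint32_t if_seq;		/* bumped on every extent map change */
};

struct cached_map {
	uint64_t validity_cookie;	/* if_seq sampled at mapping time */
};

static void map_range(const struct fork_state *fork, struct cached_map *map)
{
	map->validity_cookie = fork->if_seq;
}

static bool map_is_valid(const struct fork_state *fork,
			 const struct cached_map *map)
{
	return map->validity_cookie == fork->if_seq;
}

int main(void)
{
	struct fork_state fork = { .if_seq = 42 };
	struct cached_map map;

	map_range(&fork, &map);
	printf("before racing change: valid=%d\n", map_is_valid(&fork, &map));

	/* e.g. unwritten extent conversion completing in another thread */
	fork.if_seq++;

	/* the cached mapping must now be thrown away and the range remapped */
	printf("after racing change:  valid=%d\n", map_is_valid(&fork, &map));
	return 0;
}
```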
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org
conflicts: include/linux/iomap.h fs/iomap/buffered-io.c fs/iomap/apply.c
Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/iomap/apply.c | 14 +++++++++++--- fs/iomap/buffered-io.c | 27 +++++++++++++++++++++++++++ include/linux/iomap.h | 39 +++++++++++++++++++++++++++++++++------ 3 files changed, 71 insertions(+), 9 deletions(-)
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c index 26ab6563181f..5595d51c3ca1 100644 --- a/fs/iomap/apply.c +++ b/fs/iomap/apply.c @@ -24,11 +24,16 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, const struct iomap_ops *ops, void *data, iomap_actor_t actor) { - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; - loff_t written = 0, ret; + struct iomap iomap; + struct iomap srcmap; + loff_t written, ret; u64 end;
+stale: + memset(&iomap, 0, sizeof(struct iomap)); + memset(&srcmap, 0, sizeof(struct iomap)); + written = 0; + trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_);
/* @@ -95,5 +100,8 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, flags, &iomap); }
+ if (!ret && !written && iomap.flags & IOMAP_F_STALE) + goto stale; + return written ? written : ret; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 2209ce39511f..243de0dbc219 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -620,6 +620,25 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, goto out_no_page; }
+ /* + * Now we have a locked folio, before we do anything with it we need to + * check that the iomap we have cached is not stale. The inode extent + * mapping can change due to concurrent IO in flight (e.g. + * IOMAP_UNWRITTEN state can change and memory reclaim could have + * reclaimed a previously partially written page at this index after IO + * completion before this write reaches this file offset) and hence we + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. + */ + if (page_ops && page_ops->iomap_valid) { + bool iomap_valid = page_ops->iomap_valid(inode, iomap); + if (!iomap_valid) { + iomap->flags |= IOMAP_F_STALE; + status = 0; + goto out_unlock; + } + } + if (srcmap->type == IOMAP_INLINE) iomap_read_inline_data(inode, page, srcmap); else if (iomap->flags & IOMAP_F_BUFFER_HEAD) @@ -786,6 +805,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, srcmap); if (unlikely(status)) break; + if (iomap->flags & IOMAP_F_STALE) + break;
if (mapping_writably_mapped(inode->i_mapping)) flush_dcache_page(page); @@ -863,6 +884,8 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap); if (unlikely(status)) return status; + if (iomap->flags & IOMAP_F_STALE) + break;
status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); @@ -911,6 +934,8 @@ static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); if (status) return status; + if (iomap->flags & IOMAP_F_STALE) + return 0;
zero_user(page, offset, bytes); mark_page_accessed(page); @@ -938,6 +963,8 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, bytes = iomap_zero(inode, pos, length, iomap, srcmap); if (bytes < 0) return bytes; + if (iomap->flags & IOMAP_F_STALE) + break;
pos += bytes; length -= bytes; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0c95321f42fd..78520f28806a 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -50,20 +50,29 @@ struct vm_fault; * * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of * buffer heads for this mapping. + * + * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent + * rather than a file data extent. */ -#define IOMAP_F_NEW 0x01 -#define IOMAP_F_DIRTY 0x02 -#define IOMAP_F_SHARED 0x04 -#define IOMAP_F_MERGED 0x08 -#define IOMAP_F_BUFFER_HEAD 0x10 +#define IOMAP_F_NEW (1U << 0) +#define IOMAP_F_DIRTY (1U << 1) +#define IOMAP_F_SHARED (1U << 2) +#define IOMAP_F_MERGED (1U << 3) +#define IOMAP_F_BUFFER_HEAD (1U << 4) +#define IOMAP_F_XATTR (1U << 6)
/* * Flags set by the core iomap code during operations: * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. + * + * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file + * range it covers needs to be remapped by the high level before the operation + * can proceed. */ -#define IOMAP_F_SIZE_CHANGED 0x100 +#define IOMAP_F_SIZE_CHANGED (1U << 8) +#define IOMAP_F_STALE (1U << 9)
/* * Flags from 0x1000 up are for file system specific usage: @@ -89,6 +98,7 @@ struct iomap { void *inline_data; void *private; /* filesystem private */ const struct iomap_page_ops *page_ops; + u64 validity_cookie; /* used with .iomap_valid() */ };
static inline sector_t @@ -112,6 +122,23 @@ struct iomap_page_ops { struct iomap *iomap); void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, struct page *page, struct iomap *iomap); + + /* + * Check that the cached iomap still maps correctly to the filesystem's + * internal extent map. FS internal extent maps can change while iomap + * is iterating a cached iomap, so this hook allows iomap to detect that + * the iomap needs to be refreshed during a long running write + * operation. + * + * The filesystem can store internal state (e.g. a sequence number) in + * iomap->validity_cookie when the iomap is first mapped to be able to + * detect changes between mapping time and whenever .iomap_valid() is + * called. + * + * This is called with the folio over the specified file position held + * locked by the iomap code. + */ + bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); };
/*
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v6.1-rc4 commit 304a68b9c63bbfc1f6e159d68e8892fc54a06067 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Now that iomap supports a mechanism to validate cached iomaps for buffered write operations, hook it up to the XFS buffered write ops so that we can avoid data corruptions that result from stale cached iomaps. See:
https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.ar...
or the ->iomap_valid() introduction commit for exact details of the corruption vector.
The validity cookie we store in the iomap is based on the type of iomap we return. It is expected that the iomap->flags we set in xfs_bmbt_to_iomap() are not perturbed by the iomap core and are returned to us in the iomap passed via the .iomap_valid() callback. This ensures that the validity cookie is always checking the correct inode fork sequence numbers to detect potential changes that affect the extent cached by the iomap.
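As a userspace sketch of the cookie encoding this patch introduces (mirroring the xfs_iomap_inode_sequence() helper in the hunk below, but with local stand-in flag values rather than the real IOMAP_F_* definitions), the example packs the COW fork sequence into the upper 32 bits and the data fork sequence into the lower 32 bits, so a change to either fork invalidates a shared mapping.
```
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MY_F_SHARED	(1u << 2)	/* stand-in for IOMAP_F_SHARED */
#define MY_F_XATTR	(1u << 6)	/* stand-in for IOMAP_F_XATTR */

static uint64_t sequence_cookie(uint32_t data_seq, uint32_t cow_seq,
				uint32_t attr_seq, uint16_t flags,
				bool has_cow_fork)
{
	uint64_t cookie = 0;

	if (flags & MY_F_XATTR)
		return attr_seq;		/* attr fork mappings */
	if ((flags & MY_F_SHARED) && has_cow_fork)
		cookie = (uint64_t)cow_seq << 32;
	return cookie | data_seq;
}

int main(void)
{
	uint64_t at_map = sequence_cookie(7, 3, 0, MY_F_SHARED, true);

	/* a COW fork change alone is enough to make the mapping stale */
	uint64_t later = sequence_cookie(7, 4, 0, MY_F_SHARED, true);

	printf("stale=%d\n", at_map != later);
	return 0;
}
```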
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org
conflicts: fs/xfs/libxfs/xfs_bmap.c fs/xfs/xfs_aops.c fs/xfs/xfs_iomap.c fs/xfs/xfs_iomap.h fs/xfs/xfs_pnfs.c
Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_bmap.c | 6 ++- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_iomap.c | 96 +++++++++++++++++++++++++++++++--------- fs/xfs/xfs_iomap.h | 6 ++- fs/xfs/xfs_pnfs.c | 6 ++- 5 files changed, 89 insertions(+), 27 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f111da48e75c..c141eb71575f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4539,7 +4539,8 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4586,7 +4587,8 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 78a5833b7003..fe8c19814f1d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -436,7 +436,7 @@ xfs_map_blocks( isnullstartblock(imap.br_startblock)) goto allocate_blocks;
- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, XFS_WPC(wpc)->data_seq); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b13aa16adf58..892a9ea714ab 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -49,12 +49,44 @@ xfs_alert_fsblock_zero( return -EFSCORRUPTED; }
+u64 +xfs_iomap_inode_sequence( + struct xfs_inode *ip, + u16 iomap_flags) +{ + u64 cookie = 0; + + if (iomap_flags & IOMAP_F_XATTR) + return READ_ONCE(ip->i_af.if_seq); + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; + return cookie | READ_ONCE(ip->i_df.if_seq); +} + +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. + */ +static bool +xfs_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + return iomap->validity_cookie == + xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); +} + +const struct iomap_page_ops xfs_iomap_page_ops = { + .iomap_valid = xfs_iomap_valid, +}; + int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, - u16 flags) + u16 flags, + u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -85,6 +117,9 @@ xfs_bmbt_to_iomap( if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; + iomap->page_ops = &xfs_iomap_page_ops; return 0; }
@@ -188,7 +223,8 @@ xfs_iomap_write_direct( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + u64 *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -276,6 +312,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap);
out_unlock: + *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error;
@@ -731,6 +768,7 @@ xfs_direct_write_iomap_begin( bool shared = false; u16 iomap_flags = 0; unsigned lockmode; + u64 seq;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -785,9 +823,10 @@ xfs_direct_write_iomap_begin( goto out_unlock; }
+ seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags, seq);
allocate_blocks: error = -EAGAIN; @@ -813,23 +852,25 @@ xfs_direct_write_iomap_begin( xfs_iunlock(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - &imap); + &imap, &seq); if (error) return error;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW, seq);
out_found_cow: - xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + seq = xfs_iomap_inode_sequence(ip, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0, seq); if (error) - return error; + goto out_unlock; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED, seq);
out_unlock: if (lockmode) @@ -860,6 +901,7 @@ xfs_buffered_write_iomap_begin( bool eof = false, cow_eof = false, shared = false; int allocfork = XFS_DATA_FORK; int error = 0; + u64 seq;
if (xfs_is_shutdown(mp)) return -EIO; @@ -1039,25 +1081,30 @@ xfs_buffered_write_iomap_begin( * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW, seq);
found_imap: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, 0, seq);
found_cow: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0, seq); if (error) - return error; - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + goto out_unlock; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED, seq); }
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0, seq);
out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1157,6 +1204,7 @@ xfs_read_iomap_begin( int nimaps = 1, error = 0; bool shared = false; unsigned lockmode; + u64 seq;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
@@ -1170,12 +1218,14 @@ xfs_read_iomap_begin( &nimaps, 0); if (!error && (flags & IOMAP_REPORT)) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); xfs_iunlock(ip, lockmode);
if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, + shared ? IOMAP_F_SHARED : 0, seq); }
const struct iomap_ops xfs_read_iomap_ops = { @@ -1200,6 +1250,7 @@ xfs_seek_iomap_begin( struct xfs_bmbt_irec imap, cmap; int error = 0; unsigned lockmode; + u64 seq;
if (xfs_is_shutdown(mp)) return -EIO; @@ -1236,7 +1287,8 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED, seq); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1257,8 +1309,9 @@ xfs_seek_iomap_begin( imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; done: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1284,6 +1337,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + int seq;
if (xfs_is_shutdown(mp)) return -EIO; @@ -1300,12 +1354,14 @@ xfs_xattr_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: + + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); xfs_iunlock(ip, lockmode);
if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, 0, seq); }
const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 3626c9894bdb..ca42c0da9518 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -12,13 +12,15 @@ struct xfs_inode; struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, - xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap); + xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap, + u64 *sequence); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb);
+u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *, u16); + struct xfs_bmbt_irec *, u16, u64 sequence_cookie);
static inline xfs_filblks_t xfs_aligned_fsb_count( diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 4ca25193925a..2876b1808e33 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -91,6 +91,7 @@ xfs_fs_map_blocks( int nimaps = 1; uint lock_flags; int error = 0; + u64 seq;
if (xfs_is_shutdown(mp)) return -EIO; @@ -142,6 +143,7 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); + seq = xfs_iomap_inode_sequence(ip, 0);
ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
@@ -155,7 +157,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags);
error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, &imap); + end_fsb - offset_fsb, &imap, &seq); if (error) goto out_unlock;
@@ -175,7 +177,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, seq); *device_generation = mp->m_generation; return error; out_unlock:
From: Ye Bin yebin10@huawei.com
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK
--------------------------------
There's issue as follows: XFS: Assertion failed: (bmv->bmv_iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c, line: 329 ------------[ cut here ]------------ kernel BUG at fs/xfs/xfs_message.c:102! invalid opcode: 0000 [#1] PREEMPT SMP KASAN RIP: 0010:assfail+0x96/0xa0 RSP: 0018:ffffc9000fa178c0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff888179a18000 RDX: 0000000000000000 RSI: ffff888179a18000 RDI: 0000000000000002 RBP: 0000000000000000 R08: ffffffff8321aab6 R09: 0000000000000000 R10: 0000000000000001 R11: ffffed1105f85139 R12: ffffffff8aacc4c0 R13: 0000000000000149 R14: ffff888269f58000 R15: 000000000000000c CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000b92388 CR3: 000000024f006000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> xfs_getbmap+0x1a5b/0x1e40 xfs_ioc_getbmap+0x1fd/0x5b0 xfs_file_ioctl+0x2cb/0x1d50 __x64_sys_ioctl+0x197/0x210 do_syscall_64+0x39/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd
Above issue may happen as follows:
        ThreadA                          ThreadB
do_shared_fault
 __do_fault
  xfs_filemap_fault
   __xfs_filemap_fault
    filemap_fault
                                 xfs_ioc_getbmap -> Without BMV_IF_DELALLOC flag
                                  xfs_getbmap
                                   xfs_ilock(ip, XFS_IOLOCK_SHARED);
                                   filemap_write_and_wait
 do_page_mkwrite
  xfs_filemap_page_mkwrite
   __xfs_filemap_fault
    xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
     iomap_page_mkwrite
      ...
      xfs_buffered_write_iomap_begin
       xfs_bmapi_reserve_delalloc -> Allocate delay extent
                                   xfs_ilock_data_map_shared(ip)
                                   xfs_getbmap_report_one
                                    ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0)
                                        -> trigger BUG_ON
As xfs_filemap_page_mkwrite() only holds the XFS_MMAPLOCK_SHARED lock, there's a small window in which mkwrite can produce a delalloc extent after the file has been flushed in xfs_getbmap(). To solve the above issue, just skip delalloc extents.
Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_bmap_util.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index b02b4f0a151d..4433368eb6d9 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -272,15 +272,13 @@ xfs_getbmap_report_one( if (isnullstartblock(got->br_startblock) || got->br_startblock == DELAYSTARTBLOCK) { /* - * Delalloc extents that start beyond EOF can occur due to - * speculative EOF allocation when the delalloc extent is larger - * than the largest freespace extent at conversion time. These - * extents cannot be converted by data writeback, so can exist - * here even if we are not supposed to be finding delalloc - * extents. + * Take the flush completion as being a point-in-time snapshot + * where there are no delalloc extents, and if any new ones + * have been created racily, just skip them as being 'after' + * the flush and so don't get reported. */ - if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip))) - ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0); + if (!(bmv->bmv_iflags & BMV_IF_DELALLOC)) + return 0;
p->bmv_oflags |= BMV_OF_DELALLOC; p->bmv_block = -2;
From: "Darrick J. Wong" djwong@kernel.org
maillist inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK
Reference: https://patchwork.kernel.org/project/xfs/patch/20230501212434.GM59213@frogsf...
--------------------------------
In commit 8ee81ed581ff, Ye Bin complained about an ASSERT in the bmapx code that trips if we encounter a delalloc extent after flushing the pagecache to disk. The ioctl code does not hold MMAPLOCK so it's entirely possible that a racing write page fault can create a delalloc extent after the file has been flushed. The proposed solution was to replace the assertion with an early return that avoids filling out the bmap recordset with a delalloc entry if the caller didn't ask for it.
At the time, I recall thinking that the forward logic sounded ok, but felt hesitant because I suspected that changing this code would cause something /else/ to burst loose due to some other subtlety.
syzbot of course found that subtlety. If all the extent mappings found after the flush are delalloc mappings, we'll reach the end of the data fork without ever incrementing bmv->bmv_entries. This is new, since before we'd have emitted the delalloc mappings even though the caller didn't ask for them. Once we reach the end, we'll try to set BMV_OF_LAST on the -1st entry (because bmv_entries is zero) and go corrupt something else in memory. Yay.
I really dislike all these stupid patches that fiddle around with debug code and break things that otherwise worked well enough. Nobody was complaining that calling XFS_IOC_BMAPX without BMV_IF_DELALLOC would return BMV_OF_DELALLOC records, and now we've gone from "weird behavior that nobody cared about" to "bad behavior that must be addressed immediately".
Reported-by: syzbot+c103d3808a0de5faaf80@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-xfs/20230412024907.GP360889@frogsfrogsfrogs/ Fixes: 8ee81ed581ff ("xfs: fix BUG_ON in xfs_getbmap()") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_bmap_util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4433368eb6d9..85d3804ebbe7 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -519,7 +519,9 @@ xfs_getbmap( if (!xfs_iext_next_extent(ifp, &icur, &got)) { xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST; + if (bmv->bmv_entries > 0) + out[bmv->bmv_entries - 1].bmv_oflags |= + BMV_OF_LAST;
if (whichfork != XFS_ATTR_FORK && bno < end && !xfs_getbmap_full(bmv)) {
From: Guo Xuenan guoxuenan@huawei.com
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I72VMY
--------------------------------
I observed the following evidence of a leak when xfs_inactive failed. Especially in debug mode, when xfs_attr_inactive fails, the current exception path handling rudely clears the inode attr fork; if the inode is then recycled an assertion will occur, and if not, it may also lead to a memory leak.
xfs_attr_inactive is supposed to clean up the attribute fork when the inode is being freed, but it removes the in-memory attribute fork even when truncating the attribute fork extents fails, so some attr data may be left in memory and never released. To keep the bulkstat ioctl from concurrently accessing this inode, force a shutdown of XFS when this situation occurs.
The following script reliably replays the bug described above.
```
DISK=vdb
MP=/mnt/$DISK
DEV=/dev/$DISK
nfiles=10
xattr_val="this is xattr value."

while true
do
	pidof fsstress | xargs kill -9
	umount $MP
	df | grep $MP || break
	sleep 2
done

mkdir -p ${MP} && mkfs.xfs -f $DEV && mount $DEV $MP
echo 0 > /sys/fs/xfs/$DISK/errortag/bmapifmt

cd $MP; touch $(seq 1 $nfiles); cd $OLDPWD
for n in `seq 1 $nfiles`; do
	for j in `seq 1 20`; do
		setfattr -n user.${j} -v "$xattr_val" $MP/$n
	done
done

fsstress -d $MP -z -f bulkstat=200 -S c -l 1000 -p 8 &
/usr/bin/rm $MP/*
echo 3 > /sys/fs/xfs/$DISK/errortag/bmapifmt
```
The assertion in the kernel log is as follows:
XFS (vdb): Mounting V5 Filesystem bd1b6c38-599a-43b3-8194-a584bebec4ca XFS (vdb): Ending clean mount xfs filesystem being mounted at /mnt/vdb supports timestamps until 2038 (0x7fffffff) XFS (vdb): Injecting error (false) at file fs/xfs/libxfs/xfs_bmap.c, line 3887, on filesystem "vdb" XFS: Assertion failed: ip->i_nblocks == 0, file: fs/xfs/xfs_inode.c, line: 2277 ------------[ cut here ]------------ kernel BUG at fs/xfs/xfs_message.c:102! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 3 PID: 74 Comm: kworker/3:1 Not tainted 6.3.0-rc6-00127-g71deb8a5658c #569 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-4.fc34 04/01/2014 Workqueue: xfs-inodegc/vdb xfs_inodegc_worker RIP: 0010:assfail+0x8c/0x90 Code: 80 3d 37 27 3b 0a 00 75 1c e8 a0 b0 20 ff 0f 0b 5b 5d 41 5c 41 5d c3 48 c7 c7 30 25 64 8c e8 fb d8 66 ff eb db e8 84 b0 20 ff <0f> 0b 66 90 0f 1f 44 00 00 55 48 89 fd 53 48 63 de e8 6e b0 20 ff RSP: 0018:ffff888101b17b20 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffffffff8444eea0 RCX: 0000000000000000 RDX: ffff888101b08040 RSI: ffffffff8228fe1c RDI: ffffffff844510c0 RBP: 0000000000000000 R08: 0000000000000001 R09: ffff888101b177ff R10: ffffed1020362eff R11: 0000000000000001 R12: ffffffff8444f720 R13: 00000000000008e5 R14: ffff888155279800 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8883edd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000001612e18 CR3: 000000017bab5005 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: <TASK> xfs_ifree+0xea6/0x1310 xfs_inactive_ifree.isra.0+0x1ab/0x460 xfs_inactive+0x41f/0x710 xfs_inodegc_worker+0x22e/0x500 process_one_work+0x6d1/0xfe0 worker_thread+0x5b9/0xf60 kthread+0x287/0x330 ret_from_fork+0x1f/0x30 </TASK> Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:assfail+0x8c/0x90 Code: 80 3d 37 27 3b 0a 00 75 1c e8 a0 b0 20 ff 0f 0b 5b 5d 41 5c 41 5d c3 48 c7 c7 30 25 64 8c e8 fb d8 66 ff eb db e8 84 b0 20 ff <0f> 0b 66 90 0f 1f 44 00 00 55 48 89 fd 53 48 63 de e8 6e b0 20 ff RSP: 0018:ffff888101b17b20 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffffffff8444eea0 RCX: 0000000000000000 RDX: ffff888101b08040 RSI: ffffffff8228fe1c RDI: ffffffff844510c0 RBP: 0000000000000000 R08: 0000000000000001 R09: ffff888101b177ff R10: ffffed1020362eff R11: 0000000000000001 R12: ffffffff8444f720 R13: 00000000000008e5 R14: ffff888155279800 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8883edd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000001612e18 CR3: 000000017bab5005 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554
Fixes: 6dfe5a049f2d ("xfs: xfs_attr_inactive leaves inconsistent attr fork state behind") Signed-off-by: Guo Xuenan guoxuenan@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_attr_inactive.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index d9c7ff1469c8..5993960d288f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -367,7 +367,7 @@ xfs_attr_inactive( if (dp->i_af.if_nextents > 0) { error = xfs_attr3_root_inactive(&trans, dp); if (error) - goto out_cancel; + goto out_shutdown;
error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); if (error) @@ -381,6 +381,8 @@ xfs_attr_inactive( xfs_iunlock(dp, lock_mode); return error;
+out_shutdown: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); out_cancel: xfs_trans_cancel(trans); out_destroy_fork:
From: Gao Xiang hsiangkao@linux.alibaba.com
Offering: HULK maillist inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I7381S
--------------------------------
There is a long-standing issue which could cause an fs shutdown due to an inode extent-to-btree conversion failure right after an extent allocation in the same AG, which is absolutely unexpected given the proper minleft reservation in the previous allocation. Brian once addressed one of the root causes [1]; however, the symptom can still occur after that commit was merged, as reported [2], and our cloud environment is also suffering from this issue.
From the description of the commit [1], I found that Zirong has an in-house stress test reproducer for this issue, so I asked him to reproduce it again, and he confirmed that the issue can still be reproduced on RHEL 9 within several days.
Thanks to him, after adding some debugging code to dump the current transaction log items, I think the root cause is as below:
1. xfs_bmapi_allocate() with the following condition:
      freeblks: 18304
      pagf_flcount: 6
      reservation: 18276
      need (min_free): 6
      args->minleft: 1

      available = freeblks + agflcount - reservation - need - minleft
                = 18304 + min(6, 6) - 18276 - 6 - 1 = 27

   The first allocation check itself is ok, and args->maxlen = 27 here
At this time, AG 3 also has the following state: 1st:64 last:69 cnt:6 longest:6395
AGFL has the following state: 64:547 65:167 66:1651 67:2040807 68:783 69:604
2. Tried to get 27 blocks from this AG, but in order to finish such allocation, it had to need a new btree block for cntbt (so take another free block from agfl). It can be seen with a new AGF recorded in the transaction: blkno 62914177, len 1, map_size 1 00000000: 58 41 47 46 00 00 00 01 00 00 00 03 00 27 ff f0 XAGF.........'.. 00000010: 00 00 00 09 00 00 00 07 00 00 00 00 00 00 00 02 ................ 00000020: 00 00 00 02 00 00 00 00 00 00 00 41 00 00 00 45 ...........A...E 00000030: 00 00 00 05 00 00 47 65 00 00 18 fb 00 00 00 09 ......Ge........ 00000040: 75 dc c1 b5 1a 45 40 2a 80 50 72 f0 59 6e 62 66 u....E@*.Pr.Ynbf
It can be parsed as: agf 3 flfirst: 65 (0x41) fllast: 69 (0x45) cnt: 5 freeblks 18277
3. agfl 64 (agbno 547, daddr 62918552) was then written as a cntbt block, which can also be seen in a log item as below: type#011= 0x123c flags#011= 0x8 blkno 62918552, len 8, map_size 1 00000000: 41 42 33 43 00 00 00 fd 00 1f 23 e4 ff ff ff ff AB3C......#..... 00000010: 00 00 00 00 03 c0 0f 98 00 00 00 00 00 00 00 00 ................ 00000020: 75 dc c1 b5 1a 45 40 2a 80 50 72 f0 59 6e 62 66 u....E@*.Pr.Ynbf ...
4. Finally, the following inode extent to btree allocation fails as below: kernel: ------------[ cut here ]------------ WARNING: CPU: 15 PID: 49290 at fs/xfs/libxfs/xfs_bmap.c:717 xfs_bmap_extents_to_btree+0xc51/0x1050 [xfs] ... XFS (sda2): agno 3 agflcount 5 freeblks 18277 reservation 18276 6
since freeblks = 18304 - 27 = 18277, but with another agfl block allocated (pagf_flcount from 6 to 5), the inequality will not be satisfied:
available = freeblks + agflcount - reservation - need - minleft = 18277 + min(5, 6) - 18276 - 6 - 0 = 0 < 1
Full current transaction log item dump can be fetched from [3].
As a short-term solution, the follow-on allocations (e.g. the allocation for the inode extent-to-btree conversion) can be recorded so that enough extra blocks are reserved for the freespace btree splits, which shortens available and args->maxlen to

available = freeblks + agflcount - reservation - need - minleft
          = 18304 + min(6, 6) - 18276 - 6*2 - 1 = 21

so args->maxlen = 21 in the first allocation, and the following conversion should then succeed. At least this approach is easy to backport as a hotfix.
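As a cross-check of the arithmetic above, here is a tiny standalone C sketch (userspace only, not kernel code; the helper name is made up) that plugs in the numbers quoted in this report; the last call doubles 'need' to mirror the (1 + args->postallocs) factor added by this patch.

/* Illustrative userspace arithmetic only; numbers are taken from the report above. */
#include <stdio.h>

static long available(long freeblks, long agflcount, long reservation,
		      long need, long minleft)
{
	long agfl = agflcount < need ? agflcount : need;	/* min(agflcount, need) */

	return freeblks + agfl - reservation - need - minleft;
}

int main(void)
{
	/* first allocation: 27 blocks look available, so args->maxlen = 27 */
	printf("first check:   %ld\n", available(18304, 6, 18276, 6, 1));
	/* extent-to-btree conversion afterwards: nothing left, the check fails */
	printf("second check:  %ld\n", available(18277, 5, 18276, 6, 0));
	/* with the patch the first check doubles 'need', trimming maxlen to 21 */
	printf("patched check: %ld\n", available(18304, 6, 18276, 12, 1));
	return 0;
}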
In the long term, args->total and args->minleft need to be revisited, although that could require more refactoring.
[1] commit 1ca89fbc48e1 ("xfs: don't account extra agfl blocks as available") https://lore.kernel.org/r/20190327145000.10756-1-bfoster@redhat.com [2] https://lore.kernel.org/r/20220105071052.GD20464@templeofstupid.com [3] https://lore.kernel.org/linux-xfs/Y2RevDyoeJZSpiat@B-P7TQMD6M-0146.local/2-d... Reported-by: Zirong Lang zlang@redhat.com Signed-off-by: Gao Xiang hsiangkao@linux.alibaba.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_alloc.c | 9 +++++++-- fs/xfs/libxfs/xfs_alloc.h | 1 + fs/xfs/libxfs/xfs_bmap.c | 1 + fs/xfs/libxfs/xfs_ialloc.c | 1 + 4 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 93e8d8d05a69..60aab422e818 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2516,7 +2516,12 @@ xfs_alloc_fix_freelist( goto out_agbp_relse; }
- need = xfs_alloc_min_freelist(mp, pag); + /* + * Also need to fulfill freespace btree splits by reserving more + * blocks to perform multiple allocations from a single AG and + * transaction if needed. + */ + need = xfs_alloc_min_freelist(mp, pag) * (1 + args->postallocs); if (!xfs_alloc_space_available(args, need, flags | XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; @@ -2540,7 +2545,7 @@ xfs_alloc_fix_freelist( xfs_agfl_reset(tp, agbp, pag);
/* If there isn't enough total space or single-extent, reject it. */ - need = xfs_alloc_min_freelist(mp, pag); + need = xfs_alloc_min_freelist(mp, pag) * (1 + args->postallocs); if (!xfs_alloc_space_available(args, need, flags)) goto out_agbp_relse;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index c147194aa338..9cf9b4b593ca 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -73,6 +73,7 @@ typedef struct xfs_alloc_arg { int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ + bool postallocs; /* number of post-allocations */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ enum xfs_ag_resv_type resv; /* block reservation to use */ } xfs_alloc_arg_t; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c141eb71575f..34fe4aed0ba8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3626,6 +3626,7 @@ xfs_bmap_btalloc( args.alignment = 1; args.minalignslop = 0; } + args.postallocs = 1; args.minleft = ap->minleft; args.wasdel = ap->wasdel; args.resv = XFS_AG_RESV_NONE; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 6c1c74497b90..b8b7f4d14329 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -630,6 +630,7 @@ xfs_ialloc_ag_alloc( int do_sparse = 0;
memset(&args, 0, sizeof(args)); + args.postallocs = 1; args.tp = tp; args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK;
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I73AXQ
--------------------------------
syzkaller found a UAF:
================================================================== BUG: KASAN: use-after-free in __rb_erase_augmented include/linux/rbtree_augmented.h:225 [inline] BUG: KASAN: use-after-free in rb_erase+0x16e/0x690 lib/rbtree.c:443 Write of size 8 at addr ffff888101990a40 by task kworker/1:1H/114 CPU: 1 PID: 114 Comm: kworker/1:1H Not tainted 5.10.0-00734-gc980ff0a1f18-dirty #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Workqueue: xfs-log/sda xlog_ioend_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xbe/0xfd lib/dump_stack.c:118 kasan_report+0x3a/0x50 mm/kasan/report.c:559 __rb_erase_augmented include/linux/rbtree_augmented.h:225 [inline] rb_erase+0x16e/0x690 lib/rbtree.c:443 xfs_extent_busy_clear_one+0x5a/0x1c0 fs/xfs/xfs_extent_busy.c:517 xfs_extent_busy_clear+0x18b/0x1d0 fs/xfs/xfs_extent_busy.c:569 xlog_cil_committed+0x12a/0x370 fs/xfs/xfs_log_cil.c:659 xlog_cil_process_committed+0xbc/0xe0 fs/xfs/xfs_log_cil.c:683 xlog_state_do_iclog_callbacks+0x30c/0x4b0 fs/xfs/xfs_log.c:2777 xlog_state_do_callback+0x99/0x150 fs/xfs/xfs_log.c:2802 xlog_ioend_work+0x57/0xc0 fs/xfs/xfs_log.c:1308 process_one_work+0x406/0x810 kernel/workqueue.c:2280 worker_thread+0x96/0x720 kernel/workqueue.c:2426 kthread+0x1f4/0x250 kernel/kthread.c:313 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:299
Allocated by task 22679: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track mm/kasan/common.c:56 [inline] set_alloc_info mm/kasan/common.c:498 [inline] __kasan_kmalloc mm/kasan/common.c:530 [inline] __kasan_kmalloc.constprop.0+0xf0/0x130 mm/kasan/common.c:501 kmalloc include/linux/slab.h:568 [inline] kmem_alloc+0xc2/0x230 fs/xfs/kmem.c:21 kmem_zalloc fs/xfs/kmem.h:69 [inline] xfs_extent_busy_insert+0x3c/0x370 fs/xfs/xfs_extent_busy.c:36 __xfs_free_extent+0x268/0x340 fs/xfs/libxfs/xfs_alloc.c:3327 xfs_free_extent fs/xfs/libxfs/xfs_alloc.h:183 [inline] xfs_ag_extend_space+0x26e/0x280 fs/xfs/libxfs/xfs_ag.c:540 xfs_growfs_data_private.isra.0+0x64e/0x6f0 fs/xfs/xfs_fsops.c:112 xfs_growfs_data+0x287/0x360 fs/xfs/xfs_fsops.c:239 xfs_file_ioctl+0x9f2/0x1320 fs/xfs/xfs_ioctl.c:2274 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl+0x111/0x160 fs/ioctl.c:739 do_syscall_64+0x30/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x61/0xc6
Freed by task 114: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:361 __kasan_slab_free.part.0+0x13f/0x1b0 mm/kasan/common.c:482 slab_free_hook mm/slub.c:1569 [inline] slab_free_freelist_hook mm/slub.c:1608 [inline] slab_free mm/slub.c:3179 [inline] kfree+0xce/0x860 mm/slub.c:4176 kvfree+0x47/0x50 mm/util.c:647 xfs_extent_busy_clear+0x18b/0x1d0 fs/xfs/xfs_extent_busy.c:569 xlog_cil_committed+0x12a/0x370 fs/xfs/xfs_log_cil.c:659 xlog_cil_process_committed+0xbc/0xe0 fs/xfs/xfs_log_cil.c:683 xlog_state_do_iclog_callbacks+0x30c/0x4b0 fs/xfs/xfs_log.c:2777 xlog_state_do_callback+0x99/0x150 fs/xfs/xfs_log.c:2802 xlog_ioend_work+0x57/0xc0 fs/xfs/xfs_log.c:1308 process_one_work+0x406/0x810 kernel/workqueue.c:2280 worker_thread+0x96/0x720 kernel/workqueue.c:2426 kthread+0x1f4/0x250 kernel/kthread.c:313 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:299
The buggy address belongs to the object at ffff888101990a40 which belongs to the cache kmalloc-64 of size 64 The buggy address is located 0 bytes inside of 64-byte region [ffff888101990a40, ffff888101990a80) The buggy address belongs to the page: page:ffffea0004066400 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x101990 head:ffffea0004066400 order:1 compound_mapcount:0 flags: 0x17ffffc0010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff) raw: 0017ffffc0010200 ffffea00009d5d88 ffff888100040a70 ffff88810004d500 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff888101990900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888101990980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff888101990a00: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb
^ ffff888101990a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888101990b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ==================================================================
The bug can be reproduced with the following sequence:
# truncate -s 1073741824 xfs_test.img
# mkfs.xfs -f -b size=1024 -d agcount=4 xfs_test.img
# truncate -s 2305843009213693952 xfs_test.img
# mount -o loop xfs_test.img /mnt/test
# fsstress -d /mnt/test -l 0 -n 10000 >/dev/null &
# xfs_growfs -D 1125899907891200 /mnt/test
The root cause is that during growfs, user space passed a huge newblocks value to xfs_growfs_data_private(); because the current sb_agblocks is small, the new AG count exceeds UINT_MAX. Since the AG number type is an unsigned int, it overflows, which makes nagcount much smaller than the actual value and the number of new blocks in the old last AG very large. When the old last AG is expanded, xfs_extlen_t is also an unsigned int and overflows again if the number of new blocks exceeds UINT_MAX and the lower 32 bits are zero. A busy extent of length zero is then inserted into the rbtree; xfs_extent_busy_clear_one() frees it abnormally without removing it from the rbtree, and a UAF is triggered the next time the rbtree is accessed. Fix it by adding a check for nagcount overflow in xfs_growfs_data_private().
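As an illustration only, the sketch below models the truncation in userspace C; the 262144 blocks-per-AG value is derived from the mkfs parameters in the reproducer (1 GiB image, 1k blocks, 4 AGs) and do_div() is replaced with plain division, both of which are assumptions rather than kernel code.

/* Hypothetical userspace model of the nagcount computation; not kernel code. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t nb = 1125899907891200ULL;	/* new size passed to growfs, in fs blocks */
	uint32_t sb_agblocks = 262144;		/* 1 GiB image, 1k blocks, 4 AGs (assumed) */

	uint64_t new = nb;			/* "use new as a temporary here" */
	uint64_t nb_mod = new % sb_agblocks;	/* stands in for do_div() */
	new /= sb_agblocks;

	/* xfs_agnumber_t is 32 bits wide, so this assignment truncates */
	uint32_t nagcount = new + (nb_mod != 0);

	printf("64-bit AG count %llu truncates to nagcount %u\n",
	       (unsigned long long)(new + (nb_mod != 0)), nagcount);

	/* the added check catches the truncation before any AG is touched */
	if (nagcount < new)
		printf("overflow detected, growfs would return -EINVAL\n");
	return 0;
}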
Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_fsops.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 1430b2b9c466..e81e052dcb75 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -54,6 +54,9 @@ xfs_growfs_data_private( new = nb; /* use new as a temporary here */ nb_mod = do_div(new, mp->m_sb.sb_agblocks); nagcount = new + (nb_mod != 0); + /* check for overflow */ + if (nagcount < new) + return -EINVAL; if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
From: Ye Bin yebin10@huawei.com
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK
--------------------------------
When doing IO fault injection, mount may hang:

blk_update_request: I/O error, dev dm-4, sector 2128216 op 0x0:(READ) flags 0x1000 phys_seg 1 prio class 0
XFS (dm-4): metadata I/O error in "xfs_btree_read_buf_block.constprop.0+0x190/0x200 [xfs]" at daddr 0x207958 len 8 error 5
blk_update_request: I/O error, dev dm-4, sector 2108042 op 0x1:(WRITE) flags 0x29800 phys_seg 1 prio class 0
XFS (dm-4): log I/O error -5
XFS (dm-4): Metadata I/O Error (0x1) detected at xfs_trans_read_buf_map+0x2b6/0x510 [xfs] (fs/xfs/xfs_trans_buf.c:296). Shutting down filesystem.
sd 6:0:0:3: [sdh] Synchronizing SCSI cache
XFS (dm-4): Please unmount the filesystem and rectify the problem(s)
XFS (dm-4): Failed to recover intents
XFS (dm-4): Ending recovery (logdev: internal)
PID: 2489297 TASK: ffff8880355c1b00 CPU: 0 COMMAND: "mount" __schedule at ffffffff93aa03c1 schedule at ffffffff93aa0c6f schedule_timeout at ffffffff93aa63c0 xfs_wait_buftarg at ffffffffc1170ff0 [xfs] xfs_log_mount_finish at ffffffffc11bddc4 [xfs] xfs_mountfs at ffffffffc11a4492 [xfs] xfs_fc_fill_super at ffffffffc11ae01c [xfs] get_tree_bdev at ffffffff92c62a79 vfs_get_tree at ffffffff92c60fe0 do_new_mount at ffffffff92caaca0 path_mount at ffffffff92cabf83 __se_sys_mount at ffffffff92cac352 do_syscall_64 at ffffffff93a8b153 entry_SYSCALL_64_after_hwframe at ffffffff93c00099
Ftrace log: mount-2489297 [002] .... 337330.575879: xfs_buf_wait_buftarg: dev 253:4 bno 0x3220 nblks 0x8 hold 2 pincount 0 lock 1 flags DONE|PAGES caller __list_l0
The above issue happens because the xfs_buf log item is still on the AIL while the log has already been shut down, so xfs_log_worker() will not wake up xfsaild to push the AIL. The last 'b_hold' reference therefore never gets dropped, and xfs_wait_buftarg() loops forever waiting for the xfs_buf to be freed. To solve this we need to push the AIL before calling xfs_wait_buftarg(). When xfs_log_mount_finish() returns an error, xfs_mountfs() calls xfs_log_mount_cancel() to clean up the AIL and then xfs_wait_buftarg() to make sure all xfs_bufs have been reclaimed. So all we need to do is call xfs_wait_buftarg() only when 'error == 0' in xfs_log_mount_finish().
Signed-off-by: Ye Bin yebin@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_log.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 506bb43b1eda..7220945cf816 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -821,7 +821,9 @@ xfs_log_mount_finish( } else { xfs_info(mp, "Ending clean mount"); } - xfs_wait_buftarg(mp->m_ddev_targp); + + if (!error) + xfs_wait_buftarg(mp->m_ddev_targp);
clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); if (readonly)
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK
--------------------------------
After running a disk unplug test and unmounting the filesystem, the umount thread hangs forever.
crash> dmesg sd 0:0:0:0: rejecting I/O to offline device XFS (sda): log I/O error -5 XFS (sda): Corruption of in-memory data (0x8) detected at xfs_defer_finish_noroll+0x12e0/0x1cf0 (fs/xfs/libxfs/xfs_defer.c:504). Shutting down filesystem. XFS (sda): Please unmount the filesystem and rectify the problem(s) XFS (sda): xfs_inactive_ifree: xfs_trans_commit returned error -5 XFS (sda): Unmounting Filesystem
crash> bt 3368 PID: 3368 TASK: ffff88801bcd8040 CPU: 3 COMMAND: "umount" #0 [ffffc900086a7ae0] __schedule at ffffffff83d3fd25 #1 [ffffc900086a7be8] schedule at ffffffff83d414dd #2 [ffffc900086a7c10] xfs_ail_push_all_sync at ffffffff8256db24 #3 [ffffc900086a7d18] xfs_unmount_flush_inodes at ffffffff824ee7e2 #4 [ffffc900086a7d28] xfs_unmountfs at ffffffff824f2eff #5 [ffffc900086a7da8] xfs_fs_put_super at ffffffff82503e69 #6 [ffffc900086a7de8] generic_shutdown_super at ffffffff81aeb8cd #7 [ffffc900086a7e10] kill_block_super at ffffffff81aefcfa #8 [ffffc900086a7e30] deactivate_locked_super at ffffffff81aeb2da #9 [ffffc900086a7e48] deactivate_super at ffffffff81aeb639 #10 [ffffc900086a7e68] cleanup_mnt at ffffffff81b6ddd5 #11 [ffffc900086a7ea0] __cleanup_mnt at ffffffff81b6dfdf #12 [ffffc900086a7eb0] task_work_run at ffffffff8126e5cf #13 [ffffc900086a7ef8] exit_to_user_mode_prepare at ffffffff813fa136 #14 [ffffc900086a7f28] syscall_exit_to_user_mode at ffffffff83d25dbb #15 [ffffc900086a7f40] do_syscall_64 at ffffffff83d1f8d9 #16 [ffffc900086a7f50] entry_SYSCALL_64_after_hwframe at ffffffff83e00085
When we free a cluster buffer from xfs_ifree_cluster(), all the in-cache inodes are marked XFS_ISTALE. On journal commit, dirty stale inodes are handled by both the buffer and inode log items, and inodes marked XFS_ISTALE are removed from the AIL because the buffer log item cleans them up. If the transaction commit fails in xfs_inactive_ifree(), inodes marked XFS_ISTALE are left in the AIL because the buf log item is never committed, which blocks the unmount thread above forever. Mark the inode items associated with the stale buffer as aborted when the buf item is released, so that the AIL can clean these items up; this prevents inode items from being stranded in the AIL where they can never be pushed.
Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_buf_item.c | 20 ++++++++++++++++++++ fs/xfs/xfs_inode.c | 3 ++- fs/xfs/xfs_inode_item.c | 3 ++- 3 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index c905cf6804d7..3a9e006b7220 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -564,8 +564,12 @@ xfs_buf_item_put( struct xfs_buf_log_item *bip) { struct xfs_log_item *lip = &bip->bli_item; + struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_item *lp, *n; + struct xfs_inode_log_item *iip; bool aborted; bool dirty; + bool stale = bip->bli_flags & XFS_BLI_STALE_INODE;
/* drop the bli ref and return if it wasn't the last one */ if (!atomic_dec_and_test(&bip->bli_refcount)) @@ -592,6 +596,22 @@ xfs_buf_item_put( if (aborted) xfs_trans_ail_delete(lip, 0); xfs_buf_item_relse(bip->bli_buf); + + /* + * If it is an inode buffer and item marked as stale, abort flushing + * inodes associated with the buf, prevent inode item left in AIL. + */ + if (aborted && stale) { + list_for_each_entry_safe(lp, n, &bp->b_li_list, li_bio_list) { + iip = container_of(lp, struct xfs_inode_log_item, + ili_item); + if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) { + set_bit(XFS_LI_ABORTED, &lp->li_flags); + xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING); + } + } + } + return true; }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ea360c9c223d..7b777540a44c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3758,7 +3758,8 @@ xfs_iflush_cluster( * once we drop the i_flags_lock. */ spin_lock(&ip->i_flags_lock); - ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); + ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE) || + test_bit(XFS_LI_ABORTED, &lip->li_flags)); if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { spin_unlock(&ip->i_flags_lock); continue; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index e160a83e7e52..7586b19b322b 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -518,7 +518,8 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error;
- if (!bp || (ip->i_flags & XFS_ISTALE)) { + if (!bp || ((ip->i_flags & XFS_ISTALE) && + !(lip->li_flags & XFS_LI_ABORTED))) { /* * Inode item/buffer is being being aborted due to cluster * buffer deletion. Trigger a log force to have that operation
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.0-rc7 commit 3d5f3ba1ac28059bdf7000cae2403e4e984308d2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Every now and then I see this crash on arm64:
Unable to handle kernel NULL pointer dereference at virtual address 00000000000000f8 Buffer I/O error on dev dm-0, logical block 8733687, async page read Mem abort info: ESR = 0x0000000096000006 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000006 CM = 0, WnR = 0 user pgtable: 64k pages, 42-bit VAs, pgdp=0000000139750000 [00000000000000f8] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000, pmd=0000000000000000 Internal error: Oops: 96000006 [#1] PREEMPT SMP Buffer I/O error on dev dm-0, logical block 8733688, async page read Dumping ftrace buffer: Buffer I/O error on dev dm-0, logical block 8733689, async page read (ftrace buffer empty) XFS (dm-0): log I/O error -5 Modules linked in: dm_thin_pool dm_persistent_data XFS (dm-0): Metadata I/O Error (0x1) detected at xfs_trans_read_buf_map+0x1ec/0x590 [xfs] (fs/xfs/xfs_trans_buf.c:296). dm_bio_prison XFS (dm-0): Please unmount the filesystem and rectify the problem(s) XFS (dm-0): xfs_imap_lookup: xfs_ialloc_read_agi() returned error -5, agno 0 dm_bufio dm_log_writes xfs nft_chain_nat xt_REDIRECT nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip6t_REJECT potentially unexpected fatal signal 6. nf_reject_ipv6 potentially unexpected fatal signal 6. ipt_REJECT nf_reject_ipv4 CPU: 1 PID: 122166 Comm: fsstress Tainted: G W 6.0.0-rc5-djwa #rc5 3004c9f1de887ebae86015f2677638ce51ee7 rpcsec_gss_krb5 auth_rpcgss xt_tcpudp ip_set_hash_ip ip_set_hash_net xt_set nft_compat ip_set_hash_mac ip_set nf_tables Hardware name: QEMU KVM Virtual Machine, BIOS 1.5.1 06/16/2021 pstate: 60001000 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) ip_tables pc : 000003fd6d7df200 x_tables lr : 000003fd6d7df1ec overlay nfsv4 CPU: 0 PID: 54031 Comm: u4:3 Tainted: G W 6.0.0-rc5-djwa #rc5 3004c9f1de887ebae86015f2677638ce51ee7405 Hardware name: QEMU KVM Virtual Machine, BIOS 1.5.1 06/16/2021 Workqueue: writeback wb_workfn sp : 000003ffd9522fd0 (flush-253:0) pstate: 60401005 (nZCv daif +PAN -UAO -TCO -DIT +SSBS BTYPE=--) pc : errseq_set+0x1c/0x100 x29: 000003ffd9522fd0 x28: 0000000000000023 x27: 000002acefeb6780 x26: 0000000000000005 x25: 0000000000000001 x24: 0000000000000000 x23: 00000000ffffffff x22: 0000000000000005 lr : __filemap_set_wb_err+0x24/0xe0 x21: 0000000000000006 sp : fffffe000f80f760 x29: fffffe000f80f760 x28: 0000000000000003 x27: fffffe000f80f9f8 x26: 0000000002523000 x25: 00000000fffffffb x24: fffffe000f80f868 x23: fffffe000f80fbb0 x22: fffffc0180c26a78 x21: 0000000002530000 x20: 0000000000000000 x19: 0000000000000000 x18: 0000000000000000
x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000001 x13: 0000000000470af3 x12: fffffc0058f70000 x11: 0000000000000040 x10: 0000000000001b20 x9 : fffffe000836b288 x8 : fffffc00eb9fd480 x7 : 0000000000f83659 x6 : 0000000000000000 x5 : 0000000000000869 x4 : 0000000000000005 x3 : 00000000000000f8 x20: 000003fd6d740020 x19: 000000000001dd36 x18: 0000000000000001 x17: 000003fd6d78704c x16: 0000000000000001 x15: 000002acfac87668 x2 : 0000000000000ffa x1 : 00000000fffffffb x0 : 00000000000000f8 Call trace: errseq_set+0x1c/0x100 __filemap_set_wb_err+0x24/0xe0 iomap_do_writepage+0x5e4/0xd5c write_cache_pages+0x208/0x674 iomap_writepages+0x34/0x60 xfs_vm_writepages+0x8c/0xcc [xfs 7a861f39c43631f15d3a5884246ba5035d4ca78b] x14: 0000000000000000 x13: 2064656e72757465 x12: 0000000000002180 x11: 000003fd6d8a82d0 x10: 0000000000000000 x9 : 000003fd6d8ae288 x8 : 0000000000000083 x7 : 00000000ffffffff x6 : 00000000ffffffee x5 : 00000000fbad2887 x4 : 000003fd6d9abb58 x3 : 000003fd6d740020 x2 : 0000000000000006 x1 : 000000000001dd36 x0 : 0000000000000000 CPU: 1 PID: 122167 Comm: fsstress Tainted: G W 6.0.0-rc5-djwa #rc5 3004c9f1de887ebae86015f2677638ce51ee7 do_writepages+0x90/0x1c4 __writeback_single_inode+0x4c/0x4ac Hardware name: QEMU KVM Virtual Machine, BIOS 1.5.1 06/16/2021 writeback_sb_inodes+0x214/0x4ac wb_writeback+0xf4/0x3b0 pstate: 60001000 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) wb_workfn+0xfc/0x580 process_one_work+0x1e8/0x480 pc : 000003fd6d7df200 worker_thread+0x78/0x430
This crash is a result of iomap_writepage_map encountering some sort of error during writeback and wanting to set that error code in the file mapping so that fsync will report it. Unfortunately, the code dereferences folio->mapping after unlocking the folio, which means that another thread could have removed the page from the page cache (writeback doesn't hold the invalidation lock) and give it to somebody else.
At best we crash the system like above; at worst, we corrupt memory or set an error on some other unsuspecting file while failing to record the problems with *this* file. Regardless, fix the problem by reporting the error to the inode mapping.
NOTE: Commit 598ecfbaa742 lifted the XFS writeback code to iomap, so this fix should be backported to XFS in the 4.6-5.4 kernels in addition to iomap in the 5.5-5.19 kernels.
Fixes: e735c0079465 ("iomap: Convert iomap_add_to_ioend() to take a folio") # 5.17 onward Fixes: 598ecfbaa742 ("iomap: lift the xfs writeback code to iomap") # 5.5-5.16, needs backporting Fixes: 150d5be09ce4 ("xfs: remove xfs_cancel_ioend") # 4.6-5.4, needs backporting Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Matthew Wilcox (Oracle) willy@infradead.org
conflicts: fs/iomap/buffered-io.c
Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 243de0dbc219..d9092aa5c3fb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1489,7 +1489,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (!count) end_page_writeback(page); done: - mapping_set_error(page->mapping, error); + mapping_set_error(inode->i_mapping, error); return error; }
From: Wu Guanghao wuguanghao3@huawei.com
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I77NBB
--------------------------------
After testing xfs_growfs + fsstress + fault injection, the following stack appeared when mounting the filesystem:
[ 149.902032] XFS (loop0): xfs_buf_map_verify: daddr 0x200001 out of range, EOFS 0x200000 [ 149.902072] WARNING: CPU: 12 PID: 3045 at fs/xfs/xfs_buf.c:535 xfs_buf_get_map+0x5ae/0x650 [xfs] ... [ 149.902473] xfs_buf_read_map+0x59/0x330 [xfs] [ 149.902621] ? xlog_recover_items_pass2+0x55/0xd0 [xfs] [ 149.902809] xlog_recover_buf_commit_pass2+0xff/0x640 [xfs] [ 149.902959] ? xlog_recover_items_pass2+0x55/0xd0 [xfs] [ 149.903104] xlog_recover_items_pass2+0x55/0xd0 [xfs] [ 149.903247] xlog_recover_commit_trans+0x2e0/0x330 [xfs] [ 149.903390] xlog_recovery_process_trans+0x8e/0xf0 [xfs] [ 149.903531] xlog_recover_process_data+0x9c/0x130 [xfs] [ 149.903687] xlog_do_recovery_pass+0x3cc/0x5d0 [xfs] [ 149.903843] xlog_do_log_recovery+0x5c/0x80 [xfs] [ 149.903984] xlog_do_recover+0x33/0x1c0 [xfs] [ 149.904125] xlog_recover+0xdd/0x190 [xfs] [ 149.904265] xfs_log_mount+0x125/0x2f0 [xfs] [ 149.904410] xfs_mountfs+0x41a/0x910 [xfs] [ 149.904558] ? __pfx_xfs_fstrm_free_func+0x10/0x10 [xfs] [ 149.904725] xfs_fs_fill_super+0x4b7/0x940 [xfs] [ 149.904873] ? __pfx_xfs_fs_fill_super+0x10/0x10 [xfs] [ 149.905016] get_tree_bdev+0x19a/0x280 [ 149.905020] vfs_get_tree+0x29/0xd0 [ 149.905023] path_mount+0x69e/0x9b0 [ 149.905026] do_mount+0x7d/0xa0 [ 149.905029] __x64_sys_mount+0xdc/0x100 [ 149.905032] do_syscall_64+0x3e/0x90 [ 149.905035] entry_SYSCALL_64_after_hwframe+0x72/0xdc
The trigger process is as follows:
1. Growfs size from 0x200000 to 0x300000
2. Use the space range of 0x200000~0x300000
3. The above operations have only been written to the log area on disk
4. Fault injection and shutdown of the filesystem
5. Mount the filesystem and replay the growfs log, but only modify the superblock buffer without modifying the mp->m_sb structure in memory
6. Continue the log replay; at this point we are replaying operation 2, and it is found that more blocks are used than mp->m_sb.sb_dblocks
Therefore, during log replay, if there are any modifications made to the superblock, we should refresh the information recorded in the mp->m_sb.
Signed-off-by: Wu Guanghao wuguanghao3@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_buf_item_recover.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+)
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 3b6523c43a1b..368937745f80 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -22,6 +22,8 @@ #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_quota.h" +#include "xfs_sb.h" +#include "xfs_ag.h"
/* * This is the number of entries in the l_buf_cancel_table used during @@ -969,6 +971,28 @@ xlog_recover_buf_commit_pass2( goto out_release; } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + /* + * If the superblock buffer is modified, we also need to modify the + * content of the mp. + */ + if (bp->b_maps[0].bm_bn == XFS_SB_DADDR && bp->b_ops) { + struct xfs_dsb *sb = bp->b_addr; + + bp->b_ops->verify_write(bp); + error = bp->b_error; + if (error) + goto out_release; + + if (be32_to_cpu(sb->sb_agcount) > mp->m_sb.sb_agcount) { + error = xfs_initialize_perag(mp, + be32_to_cpu(sb->sb_agcount), + &mp->m_maxagi); + if (error) + goto out_release; + } + + xfs_sb_from_disk(&mp->m_sb, sb); + } }
/*
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK
--------------------------------
KASAN reported a UAF bug while fault injection test:
================================================================== BUG: KASAN: use-after-free in xfs_inode_item_push+0x2db/0x2f0 Read of size 8 at addr ffff888022f74788 by task xfsaild/sda/479
CPU: 0 PID: 479 Comm: xfsaild/sda Not tainted 6.2.0-rc7-00003-ga8a43e2eb5f6 #89 Call Trace: <TASK> dump_stack_lvl+0x51/0x6a print_report+0x171/0x4a6 kasan_report+0xb7/0x130 xfs_inode_item_push+0x2db/0x2f0 xfsaild+0x729/0x1f70 kthread+0x290/0x340 ret_from_fork+0x1f/0x30 </TASK>
Allocated by task 494: kasan_save_stack+0x22/0x40 kasan_set_track+0x25/0x30 __kasan_slab_alloc+0x58/0x70 kmem_cache_alloc+0x197/0x5d0 xfs_inode_item_init+0x62/0x170 xfs_trans_ijoin+0x15e/0x240 xfs_init_new_inode+0x573/0x1820 xfs_create+0x6a1/0x1020 xfs_generic_create+0x544/0x5d0 vfs_mkdir+0x5d0/0x980 do_mkdirat+0x14e/0x220 __x64_sys_mkdir+0x6a/0x80 do_syscall_64+0x39/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd
Freed by task 14: kasan_save_stack+0x22/0x40 kasan_set_track+0x25/0x30 kasan_save_free_info+0x2e/0x40 __kasan_slab_free+0x114/0x1b0 kmem_cache_free+0xee/0x4e0 xfs_inode_free_callback+0x187/0x2a0 rcu_do_batch+0x317/0xce0 rcu_core+0x686/0xa90 __do_softirq+0x1b6/0x626
The buggy address belongs to the object at ffff888022f74758 which belongs to the cache xfs_ili of size 200 The buggy address is located 48 bytes inside of 200-byte region [ffff888022f74758, ffff888022f74820)
The buggy address belongs to the physical page: page:ffffea00008bdd00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x22f74 head:ffffea00008bdd00 order:1 compound_mapcount:0 subpages_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffff888010ed4040 ffffea00008b2510 ffffea00008bde10 raw: 0000000000000000 00000000001a001a 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff888022f74680: 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc ffff888022f74700: fc fc fc fc fc fc fc fc fc fc fc fa fb fb fb fb
ffff888022f74780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^ ffff888022f74800: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ffff888022f74880: fc fc 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ==================================================================
When xfsaild pushes an inode item, it can race with the inode reclaim task. Consider the following call graph, where both tasks deal with the same inode. While flushing the cluster under shutdown conditions, xfs_iflush_abort() is entered, the inode's XFS_IFLUSHING flag is cleared and lip->li_buf is set to NULL. Concurrently, the inode is reclaimed under shutdown conditions; there is no need to wait for the xfs buf lock because lip->li_buf is already NULL, so the inode can be freed via an RCU callback if the xfsaild task is scheduled out during the cluster flush. It is therefore unsafe to reference lip after flushing the cluster in xfs_inode_item_push().
<log item is in AIL> <filesystem shutdown> spin_lock(&ailp->ail_lock) xfs_inode_item_push(lip) xfs_buf_trylock(bp) spin_unlock(&lip->li_ailp->ail_lock) xfs_iflush_cluster(bp) if (xfs_is_shutdown()) xfs_iflush_abort(ip) xfs_trans_ail_delete(ip) spin_lock(&ailp->ail_lock) spin_unlock(&ailp->ail_lock) xfs_iflush_abort_clean(ip) error = -EIO <log item removed from AIL> <log item li_buf set to null> if (error) xfs_force_shutdown() xlog_shutdown_wait(mp->m_log) might_sleep() xfs_reclaim_inode(ip) if (shutdown) xfs_iflush_shutdown_abort(ip) if (!bp) xfs_iflush_abort(ip) return __xfs_inode_free(ip) call_rcu(ip, xfs_inode_free_callback) ...... <rcu grace period expires> <rcu free callbacks run somewhere> xfs_inode_free_callback(ip) kmem_cache_free(ip->i_itemp) ...... <starts running again> xfs_buf_ioend_fail(bp); xfs_buf_ioend(bp) xfs_buf_relse(bp); return error spin_lock(&lip->li_ailp->ail_lock) <UAF on log item>
Fix the UAF by taking XFS_ILOCK_SHARED in xfs_inode_item_push(); this prevents the race between inode item push and inode reclaim.
Fixes: 90c60e164012 ("xfs: xfs_iflush() is no longer necessary") Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_inode_item.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7586b19b322b..fb1d482c7200 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -535,9 +535,14 @@ xfs_inode_item_push( if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING;
- if (!xfs_buf_trylock(bp)) + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED;
+ if (!xfs_buf_trylock(bp)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return XFS_ITEM_LOCKED; + } + spin_unlock(&lip->li_ailp->ail_lock);
/* @@ -563,6 +568,7 @@ xfs_inode_item_push( }
spin_lock(&lip->li_ailp->ail_lock); + xfs_iunlock(ip, XFS_ILOCK_SHARED); return rval; }
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6I11V
--------------------------------
KASAN reported a UAF bug while fault injection test:
================================================================== BUG: KASAN: use-after-free in __list_del_entry_valid+0x2b1/0x2c0 Read of size 8 at addr ffff888023edb888 by task kworker/0:1/34
CPU: 0 PID: 34 Comm: kworker/0:1 Not tainted 5.10.0-07305-g4c00b418452b-dirty #369 Workqueue: xfs-reclaim/sda xfs_reclaim_worker Call Trace: dump_stack+0x115/0x16b print_address_description.constprop.0+0x2c/0x450 kasan_report.cold+0x5d/0xdb __asan_report_load8_noabort+0x20/0x30 __list_del_entry_valid+0x2b1/0x2c0 xfs_iflush_abort_clean+0x11c/0x290 xfs_iflush_abort+0xd2/0x2c0 xfs_iflush_shutdown_abort+0x2e3/0x580 xfs_icwalk_ag+0xe9d/0x1a00 xfs_reclaim_worker+0x29/0x50 process_one_work+0x71f/0x11d0 worker_thread+0x5cb/0x10a0 kthread+0x35b/0x490 ret_from_fork+0x1f/0x30
Allocated by task 642: kasan_save_stack+0x23/0x60 __kasan_kmalloc.constprop.0+0xd9/0x140 kasan_slab_alloc+0x12/0x20 kmem_cache_alloc+0x1c4/0xa50 _xfs_buf_alloc+0x72/0xd50 xfs_buf_get_map+0x156/0x7c0 xfs_trans_get_buf_map+0x41c/0x8c0 xfs_ialloc_inode_init+0x455/0xaf0 xfs_ialloc_ag_alloc+0x71f/0x1790 xfs_dialloc+0x3f9/0x8a0 xfs_ialloc+0x12e/0x1970 xfs_dir_ialloc+0x144/0x730 xfs_create+0x623/0xe80 xfs_generic_create+0x571/0x820 xfs_vn_create+0x31/0x40 path_openat+0x209d/0x3b10 do_filp_open+0x1c2/0x2e0 do_sys_openat2+0x4fc/0x900 do_sys_open+0xd8/0x150 __x64_sys_open+0x87/0xd0 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x61/0xc6
The buggy address belongs to the object at ffff888023edb780 which belongs to the cache xfs_buf of size 392 The buggy address is located 264 bytes inside of 392-byte region [ffff888023edb780, ffff888023edb908) The buggy address belongs to the page: page:ffffea00008fb600 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888023edb780 pfn:0x23ed8 head:ffffea00008fb600 order:2 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea00008fb008 ffff888019016050 ffff888018ff4300 raw: ffff888023edb780 000000000013000d 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff888023edb780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888023edb800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff888023edb880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^ ffff888023edb900: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888023edb980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ==================================================================
This is a low-probability problem, and it took me a long time to work out the sequence in which it occurs:

1. When creating a new file, if there are no free inodes, we need to allocate a new chunk. The buf item and the inode items associated with the inode are committed to the CIL independently. If all goes well, both the buf item and the inode item are inserted into the AIL, with the buf item in front of the inode item.

2. The first time around, xfsaild pushes only the buf item. If an error occurs while writing back the inode buffer, the inode item is marked XFS_LI_FAILED in xfs_buf_inode_io_fail() when the buf IO ends, and the buf item remains in the AIL.

3. The second time around, xfsaild again pushes only the buf item. While the inode buffer is being written back the log has shut down, so the inode buffer is marked XBF_STALE and the buf item is removed from the AIL when the buf IO ends. Because the inode was never flushed, ili_last_fields in the xfs_inode is still 0, so the inode item is left in the AIL.

4. Concurrently, a new transaction logs an inode in the same cluster as the previous inode. It gets the same inode buffer in xfs_buf_find(), and the _XBF_INODES flag is cleared in xfs_buf_find() because the buffer is stale.

5. The third time around, xfsaild pushes the inode item that was marked XFS_LI_FAILED, and the AIL resubmits it in xfsaild_resubmit_item(). Because the inode buffer is missing the _XBF_INODES flag, the wrong code path is taken: every inode item on bp->b_li_list drops its reference to the buffer and has its li_buf set to NULL, but the inode items stay on bp->b_li_list. After all the references are dropped, the inode buffer is freed.

6. When XFS reclaims the inode, removing the inode item from bp->b_li_list causes a UAF in xfs_iflush_abort_clean().

Fix it by adding a shutdown check in xfs_buf_find(): once the log has shut down, there is no point in getting the buffer. While the inode item still holds a reference to the inode buffer, the _XBF_INODES flag cannot go missing.
Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_buf.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 73adabb98aa2..0d3d057c4af4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -657,6 +657,11 @@ xfs_buf_find( XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); }
+ if (xlog_is_shutdown(btp->bt_mount->m_log)) { + xfs_buf_relse(bp); + return -EIO; + } + /* * if the buffer is stale, clear all the external state associated with * it. We need to keep flags such as how we allocated the buffer memory
From: Zhang Yi yi.zhang@huawei.com
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK
--------------------------------
Factor out __xfs_da3_node_read() from xfs_da3_node_read() so that callers can pass a flags parameter; xfs_da3_node_read() becomes a thin wrapper around it.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/libxfs/xfs_da_btree.c | 5 +++-- fs/xfs/libxfs/xfs_da_btree.h | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 66b646269d29..10e93c9ce827 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -366,16 +366,17 @@ xfs_da3_node_set_type( }
int -xfs_da3_node_read( +__xfs_da3_node_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, + unsigned int flags, struct xfs_buf **bpp, int whichfork) { int error;
- error = xfs_da_read_buf(tp, dp, bno, 0, bpp, whichfork, + error = xfs_da_read_buf(tp, dp, bno, flags, bpp, whichfork, &xfs_da3_node_buf_ops); if (error || !*bpp || !tp) return error; diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ad5dd324631a..adb3c4419051 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -188,11 +188,22 @@ int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); -int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork); +int __xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, unsigned int flags, + struct xfs_buf **bpp, int whichfork); int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork); +static inline int +xfs_da3_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + struct xfs_buf **bpp, + int whichfork) +{ + return __xfs_da3_node_read(tp, dp, bno, 0, bpp, whichfork); +}
/* * Utility routines.
From: Zhang Yi yi.zhang@huawei.com
Offering: HULK hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I76JSK
--------------------------------
When inactivating an unlinked inode and its attrs, if the log is shut down during or just after the recursive deletion of attribute nodes/leaves in xfs_attr3_root_inactive(), the log records some buffer cancel items but does not contain the corresponding extent removals and inode updates; this is incomplete and inconsistent. Because the inactivation is not complete and the unlinked inode is still in the agi_unlinked table, it will continue to be inactivated after replaying the log on the next mount. The records that created the attr node/leaf blocks before the cancel items cannot be replayed, but the inode update can, so we can read corrupted data from the cancelled blocks.
XFS (pmem0): Metadata corruption detected at xfs_da3_node_read_verify+0x53/0x220, xfs_da3_node block 0x78 XFS (pmem0): Unmount and run xfs_repair XFS (pmem0): First 128 bytes of corrupted metadata buffer: 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000050: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ XFS (pmem0): metadata I/O error in "xfs_da_read_buf+0x104/0x190" at daddr 0x78 len 8 error 117
In order to fix the issue, we need to remove the extent entries and update the inode and attr btree atomically when staling attr node/leaf blocks. Note that we might also need to log and update the parent attr node entry when removing a child node or leaf block. Fortunately, it does not have to be that complicated: we can leave the removed entries as holes and skip them if the inactivation has to be re-run; the whole node tree is removed completely in the end.
Cc: stable@vger.kernel.org Signed-off-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Long Li leo.lilong@huawei.com --- fs/xfs/xfs_attr_inactive.c | 62 ++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 16 deletions(-)
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 5993960d288f..a0acbe3a7e3d 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -23,6 +23,7 @@ #include "xfs_quota.h" #include "xfs_dir2.h" #include "xfs_error.h" +#include "xfs_defer.h"
/* * Invalidate any incore buffers associated with this remote attribute value @@ -139,7 +140,8 @@ xfs_attr3_node_inactive( xfs_daddr_t parent_blkno, child_blkno; struct xfs_buf *child_bp; struct xfs_da3_icnode_hdr ichdr; - int error, i; + int error, i, done; + xfs_filblks_t count = mp->m_attr_geo->fsbcount;
/* * Since this code is recursive (gasp!) we must protect ourselves. @@ -172,10 +174,13 @@ xfs_attr3_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da3_node_read(*trans, dp, child_fsb, &child_bp, - XFS_ATTR_FORK); + error = __xfs_da3_node_read(*trans, dp, child_fsb, + XFS_DABUF_MAP_HOLE_OK, &child_bp, + XFS_ATTR_FORK); if (error) return error; + if (!child_bp) + goto next_entry;
/* save for re-read later */ child_blkno = xfs_buf_daddr(child_bp); @@ -207,14 +212,32 @@ xfs_attr3_node_inactive( * Remove the subsidiary block from the cache and from the log. */ error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, - child_blkno, - XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, - &child_bp); + child_blkno, XFS_FSB_TO_BB(mp, count), + 0, &child_bp); if (error) return error; + + error = xfs_bunmapi(*trans, dp, child_fsb, count, + XFS_BMAPI_ATTRFORK, 0, &done); + if (error) { + xfs_trans_brelse(*trans, child_bp); + return error; + } xfs_trans_binval(*trans, child_bp); + + error = xfs_defer_finish(trans); + if (error) + return error; child_bp = NULL;
+ /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll_inode(trans, dp); + if (error) + return error; + +next_entry: /* * If we're not done, re-read the parent to get the next * child block number. @@ -232,12 +255,6 @@ xfs_attr3_node_inactive( xfs_trans_brelse(*trans, bp); bp = NULL; } - /* - * Atomically commit the whole invalidate stuff. - */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; }
return 0; @@ -258,7 +275,8 @@ xfs_attr3_root_inactive( struct xfs_da_blkinfo *info; struct xfs_buf *bp; xfs_daddr_t blkno; - int error; + xfs_filblks_t count = mp->m_attr_geo->fsbcount; + int error, done;
/* * Read block 0 to see what we have to work with. @@ -266,8 +284,9 @@ xfs_attr3_root_inactive( * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK); - if (error) + error = __xfs_da3_node_read(*trans, dp, 0, XFS_DABUF_MAP_HOLE_OK, + &bp, XFS_ATTR_FORK); + if (error || !bp) return error; blkno = bp->b_bn;
@@ -298,7 +317,7 @@ xfs_attr3_root_inactive( * Invalidate the incore copy of the root block. */ error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno, - XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp); + XFS_FSB_TO_BB(mp, count), 0, &bp); if (error) return error; error = bp->b_error; @@ -306,7 +325,17 @@ xfs_attr3_root_inactive( xfs_trans_brelse(*trans, bp); return error; } + + error = xfs_bunmapi(*trans, dp, 0, count, XFS_BMAPI_ATTRFORK, 0, &done); + if (error) { + xfs_trans_brelse(*trans, bp); + return error; + } xfs_trans_binval(*trans, bp); /* remove from cache */ + + error = xfs_defer_finish(trans); + if (error) + return error; /* * Commit the invalidate and start the next transaction. */ @@ -369,6 +398,7 @@ xfs_attr_inactive( if (error) goto out_shutdown;
+ /* Remove the potential leftover remote attr blocks. */ error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); if (error) goto out_cancel;
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/953 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/thread/H3...