Darrick J. Wong (1): xfs: hoist multi-fsb allocation unit detection to a helper
Zhang Yi (4): xfs: reserve blocks for truncating large realtime inode iomap: don't mark blocks uptodate after partial zeroing iomap: reduce unnecessary state_lock when setting ifs uptodate and dirty bits iomap: optimize setting uptodate bit
Zhihao Cheng (2): ext4: ext4_iomap_map_blocks: Fix null pointer deference in nojournal mode iomap: improve iomap_folio_mkwrite_iter and ifs_clear_range_dirty
fs/ext4/inode.c | 2 +- fs/iomap/buffered-io.c | 50 +++++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_bmap_util.c | 4 ++-- fs/xfs/xfs_inode.h | 9 ++++++++ fs/xfs/xfs_iops.c | 15 ++++++++++++- 5 files changed, 70 insertions(+), 10 deletions(-)
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/11044 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/F...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/11044 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/F...
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.10-rc1 commit 6b700a5be9b3b69419474622336c63fdc1cc3ca4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Replace the open-coded logic to decide if a file has a multi-fsb allocation unit to a helper to make the code easier to read.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Conflicts: fs/xfs/xfs_bmap_util.c [ 5f57f7309d9ab9("xfs: create rt extent rounding helpers for realtime extent blocks") is not applied. ] Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Zhihao Cheng chengzhihao@huaweicloud.com --- fs/xfs/xfs_bmap_util.c | 4 ++-- fs/xfs/xfs_inode.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ad4aba5002c1..a38f1de3fd66 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -689,7 +689,7 @@ xfs_can_free_eofblocks( * forever. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + if (xfs_inode_has_bigrtalloc(ip)) end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) @@ -990,7 +990,7 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + if (xfs_inode_has_bigrtalloc(ip)) { startoffset_fsb = roundup_64(startoffset_fsb, mp->m_sb.sb_rextsize); endoffset_fsb = rounddown_64(endoffset_fsb, diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3beb470f1892..0d2c1e5046ed 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -305,6 +305,15 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; }
+/* + * Decide if this file is a realtime file whose data allocation unit is larger + * than a single filesystem block. + */ +static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) +{ + return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; +} + /* * Return the buftarg used for data allocations on a given inode. */
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-v6.11-rc1 commit d048945150b798147b324f05f7e8c857772b0d3f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When unaligned truncate down a big realtime file, xfs_truncate_page() only zeros out the tail EOF block, __xfs_bunmapi() should split the tail written extent and convert the later one that beyond EOF block to unwritten, but it couldn't work as expected now since the reserved block is zero in xfs_setattr_size(), this could expose stale data just after commit '943bc0882ceb ("iomap: don't increase i_size if it's not a write operation")'.
If we truncate file that contains a large enough written extent:
|< rxext >|< rtext >| ...WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW ^ (new EOF) ^ old EOF
Since we only zeros out the tail of the EOF block, and xfs_itruncate_extents()->..->__xfs_bunmapi() unmap the whole ailgned extents, it becomes this state:
|< rxext >| ...WWWzWWWWWWWWWWWWW ^ new EOF
Then if we do an extending write like this, the blocks in the previous tail extent becomes stale:
|< rxext >| ...WWWzSSSSSSSSSSSSS..........WWWWWWWWWWWWWWWWW ^ old EOF ^ append start ^ new EOF
Fix this by reserving XFS_DIOSTRAT_SPACE_RES blocks for big realtime inode.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Link: https://lore.kernel.org/r/20240618142112.1315279-2-yi.zhang@huaweicloud.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Christian Brauner brauner@kernel.org Conflicts: fs/xfs/xfs_iops.c [ 3fed24fffc76dd("xfs: Replace xfs_isilocked with xfs_assert_ilocked") is not applied. ] Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Zhihao Cheng chengzhihao@huaweicloud.com --- fs/xfs/xfs_iops.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b8ec045708c3..caba648e0ed2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -17,6 +17,8 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_btree.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" @@ -794,6 +796,7 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; + uint resblks = 0; bool did_zeroing = false;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -901,7 +904,17 @@ xfs_setattr_size( return error; }
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + /* + * For realtime inode with more than one block rtextsize, we need the + * block reservation for bmap btree block allocations/splits that can + * happen since it could split the tail written extent and convert the + * right beyond EOF one to unwritten. + */ + if (xfs_inode_has_bigrtalloc(ip)) + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, 0, &tp); if (error) return error;
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA
--------------------------------
The 'journal' could be NULL in nojournal mode, which causes a null-ptr-def problem in ext4_iomap_map_blocks().
Fixes: 7f6416dcd4a3 ("ext4: implement writeback iomap path") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Zhihao Cheng chengzhihao@huaweicloud.com --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9d5291db42b..9b877f4732f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3878,7 +3878,7 @@ static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc, * ext4_count_free_blocks() is non-zero, a commit * should free up blocks. */ - if (ret == -ENOSPC && ext4_count_free_clusters(sb)) { + if (ret == -ENOSPC && journal && ext4_count_free_clusters(sb)) { jbd2_journal_force_commit_nested(journal); goto retry; }
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA
--------------------------------
In __iomap_write_begin(), if we unaligned buffered write data to a hole of a regular file, we only zero out the place where aligned to block size that we don't want to write, but mark the whole range uptodate if block size < folio size. This is wrong since the not zeroed part will contains stale data and can be accessed by a concurrent buffered read easily (on the filesystem may not hold inode->i_rwsem) once we mark the range uptodate. At the same time, in the reading data branch, it's also unnecessary to set the just read range uptodate since we are going to set it immediately in __iomap_write_end(). Hence fix this by just drop iomap_set_range_uptodate() in the zeroing out branch.
Fixes: 9dc55f1389f9 ("iomap: add support for sub-pagesize buffered I/O without buffer heads") Reported-by: Matthew Wilcox willy@infradead.org Closes: https://lore.kernel.org/all/ZqsN5ouQTEc1KAzV@casper.infradead.org/ Signed-off-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Zhihao Cheng chengzhihao@huaweicloud.com --- fs/iomap/buffered-io.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e705db1b53c9..42402a6de4eb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -761,7 +761,6 @@ int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (status) return status; } - iomap_set_range_uptodate(folio, poff, plen); } while ((block_start += plen) < block_end);
return 0;
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA
--------------------------------
When doing buffered write, we set uptodate and drity bits of the written range separately, it holds the ifs->state_lock twice when blocksize < folio size, which is redundant. After large folio is supported, the spinlock could affect more about the performance, merge them could reduce some unnecessary locking overhead and gets some performance gain.
Suggested-by: Dave Chinner david@fromorbit.com Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org Conflicts: fs/iomap/buffered-io.c [ 15d09f865dc4("iomap: export __iomap_write_{begin|end}") is applied. ] Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Zhihao Cheng chengzhihao@huaweicloud.com --- fs/iomap/buffered-io.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 42402a6de4eb..414b7bdde787 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -211,6 +211,37 @@ bool iomap_is_fully_dirty(struct folio *folio, size_t from, size_t count) } EXPORT_SYMBOL_GPL(iomap_is_fully_dirty);
+static void ifs_set_range_dirty_uptodate(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk, nr_blks); + if (ifs_is_fully_uptodate(folio, ifs)) + folio_mark_uptodate(folio); + bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_dirty_uptodate(struct folio *folio, + size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_dirty_uptodate(folio, ifs, off, len); + else + folio_mark_uptodate(folio); + + filemap_dirty_folio(folio->mapping, folio); +} + static struct iomap_folio_state *ifs_alloc(struct inode *inode, struct folio *folio, unsigned int flags) { @@ -867,6 +898,8 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { + size_t from = offset_in_folio(folio, pos); + flush_dcache_folio(folio);
/* @@ -882,9 +915,8 @@ bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, */ if (unlikely(copied < len && !folio_test_uptodate(folio))) return false; - iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); - iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); - filemap_dirty_folio(inode->i_mapping, folio); + + iomap_set_range_dirty_uptodate(folio, from, copied); return true; } EXPORT_SYMBOL_GPL(__iomap_write_end);
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA
--------------------------------
When overwriting an already uptodate folio, we don't need to set the range uptodate again. This could save a barrier and some bit operations if block size < folio size.
Suggested-by: Matthew Wilcox willy@infradead.org Signed-off-by: Zhang Yi yi.zhang@huawei.com Conflicts: fs/iomap/buffered-io.c [ 15d09f865dc4("iomap: export __iomap_write_{begin|end}") is applied. ] Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Zhihao Cheng chengzhihao@huaweicloud.com --- fs/iomap/buffered-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 414b7bdde787..09f410f5d637 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -916,7 +916,12 @@ bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, if (unlikely(copied < len && !folio_test_uptodate(folio))) return false;
- iomap_set_range_dirty_uptodate(folio, from, copied); + if (folio_test_uptodate(folio)) { + iomap_set_range_dirty(folio, from, copied); + filemap_dirty_folio(folio->mapping, folio); + } else { + iomap_set_range_dirty_uptodate(folio, from, copied); + } return true; } EXPORT_SYMBOL_GPL(__iomap_write_end);
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z CVE: NA
--------------------------------
Improve ifs_clear_range_dirty() by using shift operating. Improve iomap_folio_mkwrite_iter() by making folio dirty once.
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Zhihao Cheng chengzhihao@huaweicloud.com --- fs/iomap/buffered-io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 09f410f5d637..fe64886e4eae 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -136,7 +136,8 @@ static void ifs_clear_range_dirty(struct folio *folio, { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); - unsigned int first_blk = DIV_ROUND_UP(off, i_blocksize(inode)); + unsigned int first_blk = round_up(off, i_blocksize(inode)) >> + inode->i_blkbits; unsigned int last_blk = (off + len) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk; unsigned long flags; @@ -1559,7 +1560,6 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
ifs_alloc(iter->inode, folio, 0); iomap_set_range_dirty(folio, 0, length); - filemap_dirty_folio(iter->inode->i_mapping, folio); }
return length; @@ -1583,6 +1583,8 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
+ if (iter.pos > folio_pos(folio)) + filemap_dirty_folio(folio->mapping, folio); if (ret < 0) goto out_unlock; folio_wait_stable(folio);