Yang Erkun (5):
  iomap: add iomap_is_fully_dirty
  iomap: export __iomap_write_{begin|end}
  ext4: fallback to generic_perform_write once iov_iter_count <= PAGE_SIZE
  iomap: export iomap_clear_range_dirty
  ext4: add ext4_iomap_invalidate_folio

 fs/ext4/file.c              |   3 +-
 fs/ext4/inode.c             | 140 +++++++++++++++++++++++++++++++++++-
 fs/iomap/buffered-io.c      |  44 +++++++++++-
 include/linux/iomap.h       |   6 ++
 include/trace/events/ext4.h |  15 ++++
 5 files changed, 203 insertions(+), 5 deletions(-)
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z
CVE: NA
--------------------------------
Like iomap_is_partially_uptodate, add iomap_is_fully_dirty; it will be used by a later patch.
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
 fs/iomap/buffered-io.c | 35 +++++++++++++++++++++++++++++++++++
 include/linux/iomap.h  |  1 +
 2 files changed, 36 insertions(+)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fee0bb9b5d75..1efc380b5efe 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -177,6 +177,41 @@ static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
 		ifs_set_range_dirty(folio, ifs, off, len);
 }
 
+/*
+ * iomap_is_fully_dirty checks whether blocks within a folio are
+ * dirty or not.
+ *
+ * Returns true if all blocks which correspond to the specified part
+ * of the folio are dirty.
+ */
+bool iomap_is_fully_dirty(struct folio *folio, size_t from, size_t count)
+{
+	struct iomap_folio_state *ifs = folio->private;
+	struct inode *inode = folio->mapping->host;
+	unsigned first, last, i;
+	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+
+	if ((nr_blocks <= 1) && folio_test_dirty(folio))
+		return true;
+
+	if (!ifs)
+		return false;
+
+	/* Caller's range may extend past the end of this folio */
+	count = min(folio_size(folio) - from, count);
+
+	/* First and last blocks in range within folio */
+	first = from >> inode->i_blkbits;
+	last = (from + count - 1) >> inode->i_blkbits;
+
+	for (i = first; i <= last; i++)
+		if (!ifs_block_is_dirty(folio, ifs, i))
+			return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(iomap_is_fully_dirty);
+
 static struct iomap_folio_state *ifs_alloc(struct inode *inode,
 		struct folio *folio, unsigned int flags)
 {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d..9b5995e029b4 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -265,6 +265,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
+bool iomap_is_fully_dirty(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
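For review context, a minimal sketch of the intended use of the new helper (illustrative only, not part of the patch; the real caller is ext4_iomap_write_begin() in patch 3): if every block backing the write range is already dirty, the caller can copy data straight into the folio and skip the mapping lookup.

	/*
	 * Illustrative sketch: decide whether a buffered write at @pos/@len
	 * can skip the extent lookup. The folio must be locked so per-block
	 * dirty state cannot change underneath us.
	 */
	static bool write_range_fully_dirty(struct folio *folio, loff_t pos,
					    size_t len)
	{
		size_t off = offset_in_folio(folio, pos);

		return iomap_is_fully_dirty(folio, off, len);
	}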
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z
CVE: NA
--------------------------------
These two functions will be used by a later patch; export them.
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
 fs/iomap/buffered-io.c | 6 ++++--
 include/linux/iomap.h  | 4 ++++
 2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1efc380b5efe..a989953007fa 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -700,7 +700,7 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
 	return submit_bio_wait(&bio);
 }
 
-static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 		size_t len, struct folio *folio)
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -762,6 +762,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__iomap_write_begin);
 
 static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
 		size_t len)
@@ -860,7 +861,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 	return status;
 }
 
-static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 		size_t copied, struct folio *folio)
 {
 	flush_dcache_folio(folio);
@@ -883,6 +884,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return true;
 }
+EXPORT_SYMBOL_GPL(__iomap_write_end);
 
 static void iomap_write_end_inline(const struct iomap_iter *iter,
 		struct folio *folio, loff_t pos, size_t copied)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 9b5995e029b4..4d9eacc9d7c3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -258,6 +258,10 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
+int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+		size_t len, struct folio *folio);
+bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+		size_t copied, struct folio *folio);
 int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 		struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
 		int (*punch)(struct inode *inode, loff_t pos, loff_t length));
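To make the pairing concrete, a hedged sketch of how the exported pair brackets a page-cache copy (the real user is ext4's ->write_begin/->write_end in the next patch; this simplified version fails a short copy instead of retrying the way iomap_write_iter() does, and it assumes copy_folio_from_iter_atomic() is available in this tree):

	static ssize_t copy_into_folio(struct iomap_iter *iter, struct folio *folio,
				       loff_t pos, size_t len, struct iov_iter *from)
	{
		size_t copied;
		int ret;

		/* Read in or zero any blocks the copy will not fully overwrite. */
		ret = __iomap_write_begin(iter, pos, len, folio);
		if (ret)
			return ret;

		copied = copy_folio_from_iter_atomic(folio,
					offset_in_folio(folio, pos), len, from);

		/*
		 * Marks the copied blocks uptodate and dirties the folio;
		 * returns false on a short copy into a non-uptodate folio.
		 */
		if (!__iomap_write_end(iter->inode, pos, len, copied, folio))
			return -EFAULT;

		return copied;
	}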
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z
CVE: NA
--------------------------------
iomap_file_buffered_write() always calls iomap_iter() to look up the iomap for buffered I/O, and then calls iomap_write_iter() to get and fill the folio, which is quite heavyweight compared to generic_perform_write(). Unixbench runs on ext4 show a performance degradation since buffered I/O was switched to iomap.

Fix this by implementing .write_begin and .write_end; when the write range is already marked dirty, there is no need to look up the iomap at all.
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
 fs/ext4/file.c              |   3 +-
 fs/ext4/inode.c             | 124 ++++++++++++++++++++++++++++++++++++
 include/trace/events/ext4.h |  15 +++++
 3 files changed, 141 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 67b0e2212ca0..27d4eff79941 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -314,7 +314,8 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 	if (ret <= 0)
 		goto out;
 
-	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP) &&
+	    iov_iter_count(from) > PAGE_SIZE)
 		ret = ext4_iomap_buffered_write(iocb, from);
 	else
 		ret = generic_perform_write(iocb, from);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 93a9dd03cb5c..d6506c27f971 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3955,6 +3955,128 @@ static int ext4_iomap_writepages(struct address_space *mapping,
 	return ret;
 }
 
+static int ext4_iomap_write_begin(struct file *file,
+				  struct address_space *mapping, loff_t pos,
+				  unsigned len, struct page **pagep,
+				  void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct iomap_iter iter = {
+		.inode = inode,
+		.flags = IOMAP_WRITE,
+	};
+	int ret = 0, retries = 0;
+	struct folio *folio;
+	bool delalloc;
+
+	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
+	trace_ext4_iomap_write_begin(inode, pos, len);
+
+	delalloc = test_opt(inode->i_sb, DELALLOC) &&
+		   !ext4_nonda_switch(inode->i_sb);
+	*fsdata = delalloc ? (void *)0 : (void *)FALL_BACK_TO_NONDELALLOC;
+
+retry:
+	iter.pos = pos;
+	iter.len = len;
+
+	folio = iomap_get_folio(&iter, pos, len);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
+
+	WARN_ON_ONCE(pos + len > folio_pos(folio) + folio_size(folio));
+
+	if (iomap_is_fully_dirty(folio, offset_in_folio(folio, pos), len))
+		goto out;
+
+	do {
+		int length;
+
+		ret = __ext4_iomap_buffered_io_begin(inode, iter.pos, iter.len,
+				iter.flags, &iter.iomap, NULL, delalloc);
+		if (ret)
+			goto out;
+
+		WARN_ON_ONCE(iter.iomap.offset > iter.pos);
+		WARN_ON_ONCE(iter.iomap.length == 0);
+		WARN_ON_ONCE(iter.iomap.offset + iter.iomap.length <= iter.pos);
+
+		length = iomap_length(&iter);
+		ret = __iomap_write_begin(&iter, iter.pos, length, folio);
+		if (ret)
+			goto out;
+
+		iter.pos += length;
+		iter.len -= length;
+	} while (iter.len);
+
+out:
+	if (ret < 0) {
+		folio_unlock(folio);
+		folio_put(folio);
+
+		/*
+		 * __ext4_iomap_buffered_io_begin() may have instantiated
+		 * a few blocks outside i_size. Trim these off again. Don't
+		 * need i_size_read because we hold inode lock.
+		 */
+		if (pos + len > inode->i_size)
+			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+	}
+
+	*pagep = folio_file_page(folio, pos >> PAGE_SHIFT);
+	return ret;
+}
+
+static int ext4_iomap_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int write_mode = (int)(unsigned long)fsdata;
+	struct folio *folio = page_folio(page);
+	loff_t old_size = inode->i_size;
+	size_t written;
+
+	trace_ext4_iomap_write_end(inode, pos, len, copied);
+
+	written = __iomap_write_end(inode, pos, len, copied, folio) ?
+		  copied : 0;
+
+	/*
+	 * Update the in-memory inode size after copying the data into
+	 * the page cache. It's important to update i_size while still
+	 * holding folio lock, because folio writeout could otherwise
+	 * come in and zero beyond i_size.
+	 */
+	if (pos + written > old_size)
+		i_size_write(inode, pos + written);
+
+	folio_unlock(folio);
+	folio_put(folio);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+
+	/*
+	 * For delalloc, if we have pre-allocated more blocks and copied
+	 * less, we will have delalloc extents allocated outside i_size,
+	 * drop pre-allocated blocks that were not used, prevent the
+	 * write back path from allocating blocks for them.
+	 */
+	if (unlikely(!written) && write_mode != FALL_BACK_TO_NONDELALLOC)
+		ext4_truncate_failed_write(inode);
+
+	return written;
+}
+
 /*
  * For data=journal mode, folio should be marked dirty only when it was
  * writeably mapped. When that happens, it was already attached to the
@@ -4048,6 +4170,8 @@ static const struct address_space_operations ext4_iomap_aops = {
 	.read_folio		= ext4_iomap_read_folio,
 	.readahead		= ext4_iomap_readahead,
 	.writepages		= ext4_iomap_writepages,
+	.write_begin		= ext4_iomap_write_begin,
+	.write_end		= ext4_iomap_write_end,
 	.dirty_folio		= iomap_dirty_folio,
 	.bmap			= ext4_bmap,
 	.invalidate_folio	= iomap_invalidate_folio,
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 588991b57c12..d500568daeb1 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -389,6 +389,13 @@ DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
 	TP_ARGS(inode, pos, len)
 );
 
+DEFINE_EVENT(ext4__write_begin, ext4_iomap_write_begin,
+
+	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),
+
+	TP_ARGS(inode, pos, len)
+);
+
 DECLARE_EVENT_CLASS(ext4__write_end,
 	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 		 unsigned int copied),
@@ -441,6 +448,14 @@ DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
 	TP_ARGS(inode, pos, len, copied)
 );
 
+DEFINE_EVENT(ext4__write_end, ext4_iomap_write_end,
+
+	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+		 unsigned int copied),
+
+	TP_ARGS(inode, pos, len, copied)
+);
+
 TRACE_EVENT(ext4_writepages,
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
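The resulting control flow for a small (<= PAGE_SIZE) write, sketched as a call graph (function names from this series; the copy step is whatever generic_perform_write() uses in this tree):

	/*
	 * generic_perform_write()
	 *   -> ext4_iomap_write_begin()       // locked folio, blocks prepared
	 *        iomap_get_folio()
	 *        iomap_is_fully_dirty()?      // yes: skip the mapping loop
	 *        __ext4_iomap_buffered_io_begin() + __iomap_write_begin()
	 *   -> copy from the iov_iter into the folio
	 *   -> ext4_iomap_write_end()         // i_size update, dirty folio
	 */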
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z
CVE: NA
--------------------------------
This will be used by a later patch.
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
 fs/iomap/buffered-io.c | 3 ++-
 include/linux/iomap.h  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index a989953007fa..b38c666b4ac9 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -146,13 +146,14 @@ static void ifs_clear_range_dirty(struct folio *folio,
 	spin_unlock_irqrestore(&ifs->state_lock, flags);
 }
 
-static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
+void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
 {
 	struct iomap_folio_state *ifs = folio->private;
 
 	if (ifs)
 		ifs_clear_range_dirty(folio, ifs, off, len);
 }
+EXPORT_SYMBOL_GPL(iomap_clear_range_dirty);
 
 static void ifs_set_range_dirty(struct folio *folio,
 		struct iomap_folio_state *ifs, size_t off, size_t len)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4d9eacc9d7c3..e0320f2fae65 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -270,6 +270,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 bool iomap_is_fully_dirty(struct folio *, size_t from, size_t count);
+void iomap_clear_range_dirty(struct folio *, size_t off, size_t len);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
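One point worth noting for callers: the helper only clears the per-block dirty bits in iomap_folio_state; the folio-level dirty flag stays set. A hedged sketch of a block-aligned caller (hypothetical helper; the real user arrives in the next patch):

	static void clear_dirty_block_aligned(struct inode *inode,
					      struct folio *folio,
					      size_t off, size_t len)
	{
		unsigned int bsize = i_blocksize(inode);
		size_t start = round_up(off, bsize);
		size_t end = round_down(off + len, bsize);

		/* Only whole blocks can safely lose their dirty bits. */
		if (start < end)
			iomap_clear_range_dirty(folio, start, end - start);
	}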
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z
CVE: NA
--------------------------------
The write path now checks whether the range being written is already dirty so it can bypass the iomap lookup, which fixes the performance degradation seen in benchmarks such as Unixbench. However, iomap_invalidate_folio() only clears the dirty state when the entire folio is invalidated. Add ext4_iomap_invalidate_folio() to also clear per-block dirty bits on partial invalidation.
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
 fs/ext4/inode.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6506c27f971..3a87eb44039d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4115,6 +4115,20 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
 					&ext4_iomap_report_ops);
 }
 
+static void ext4_iomap_invalidate_folio(struct folio *folio, size_t offset,
+					size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	size_t start, end;
+
+	if (offset == 0 && len == folio_size(folio))
+		return iomap_invalidate_folio(folio, offset, len);
+
+	start = round_up(offset, EXT4_BLOCK_SIZE(inode->i_sb));
+	end = round_down(offset + len, EXT4_BLOCK_SIZE(inode->i_sb));
+	iomap_clear_range_dirty(folio, start, end - start);
+}
+
 static const struct address_space_operations ext4_aops = {
 	.read_folio		= ext4_read_folio,
 	.readahead		= ext4_readahead,
@@ -4174,7 +4188,7 @@ static const struct address_space_operations ext4_iomap_aops = {
 	.write_end		= ext4_iomap_write_end,
 	.dirty_folio		= iomap_dirty_folio,
 	.bmap			= ext4_bmap,
-	.invalidate_folio	= iomap_invalidate_folio,
+	.invalidate_folio	= ext4_iomap_invalidate_folio,
 	.release_folio		= iomap_release_folio,
 	.direct_IO		= noop_direct_IO,
 	.migrate_folio		= filemap_migrate_folio,
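A worked example of the inward rounding, assuming 1k blocks in a 4k folio; note that for a sub-block range the rounded start can pass the rounded end, so a caller-side guard like the one sketched under patch 4 may be worth considering:

	/*
	 * Invalidating bytes [512, 3072):
	 *   start = round_up(512, 1024)    = 1024
	 *   end   = round_down(3072, 1024) = 3072
	 * => blocks 1 and 2 are cleared; blocks 0 and 3 keep their dirty
	 *    bits because they still hold bytes outside the invalidated
	 *    range.
	 */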
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/7225 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/J...