hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9DN5Z
CVE: NA
--------------------------------
iomap_file_buffered_write() always calls iomap_iter() to look up the iomap
for buffered I/O, and then calls iomap_write_iter() to get and fill the
folio, which is quite complicated compared to generic_perform_write().
Unixbench tests on ext4 show a performance degradation since buffered I/O
for ext4 was switched to iomap.
Fix this by supporting .write_begin and .write_end: for the case where the
write range has already been marked dirty, there is no need to look up the
iomap.
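With this change, small buffered writes (at most PAGE_SIZE) go through
generic_perform_write(), which drives the new hooks roughly as in the
simplified sketch below (not verbatim kernel code; a_ops, page, fsdata and
written are shorthand for the usual locals); ext4_iomap_write_begin() can
then skip the iomap lookup entirely when the written range of the folio is
already dirty:

  do {
	bytes = min(PAGE_SIZE - offset_in_page(pos), iov_iter_count(i));

	/* filesystem prepares (and, if needed, maps) the target page */
	status = a_ops->write_begin(file, mapping, pos, bytes,
				    &page, &fsdata);
	if (unlikely(status < 0))
		break;

	/* copy user data into the page cache */
	copied = copy_page_from_iter_atomic(page, offset_in_page(pos),
					    bytes, i);

	/* filesystem commits the copied range and marks the folio dirty */
	status = a_ops->write_end(file, mapping, pos, bytes, copied,
				  page, fsdata);

	pos += status;
	written += status;
	balance_dirty_pages_ratelimited(mapping);
  } while (iov_iter_count(i) && status > 0);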
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
 fs/ext4/file.c              |   3 +-
 fs/ext4/inode.c             | 124 ++++++++++++++++++++++++++++++++++++
 include/trace/events/ext4.h |  15 +++++
 3 files changed, 141 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 67b0e2212ca0..27d4eff79941 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -314,7 +314,8 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 	if (ret <= 0)
 		goto out;
 
-	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP) &&
+	    iov_iter_count(from) > PAGE_SIZE)
 		ret = ext4_iomap_buffered_write(iocb, from);
 	else
 		ret = generic_perform_write(iocb, from);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 93a9dd03cb5c..d6506c27f971 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3955,6 +3955,128 @@ static int ext4_iomap_writepages(struct address_space *mapping,
 	return ret;
 }
 
+static int ext4_iomap_write_begin(struct file *file,
+				  struct address_space *mapping, loff_t pos,
+				  unsigned len, struct page **pagep,
+				  void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct iomap_iter iter = {
+		.inode = inode,
+		.flags = IOMAP_WRITE,
+	};
+	int ret = 0, retries = 0;
+	struct folio *folio;
+	bool delalloc;
+
+	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
+	trace_ext4_iomap_write_begin(inode, pos, len);
+
+	delalloc = test_opt(inode->i_sb, DELALLOC) &&
+		   !ext4_nonda_switch(inode->i_sb);
+	*fsdata = delalloc ? (void *)0 : (void *)FALL_BACK_TO_NONDELALLOC;
+
+retry:
+	iter.pos = pos;
+	iter.len = len;
+
+	folio = iomap_get_folio(&iter, pos, len);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
+
+	WARN_ON_ONCE(pos + len > folio_pos(folio) + folio_size(folio));
+
+	if (iomap_is_fully_dirty(folio, offset_in_folio(folio, pos), len))
+		goto out;
+
+	do {
+		int length;
+
+		ret = __ext4_iomap_buffered_io_begin(inode, iter.pos, iter.len,
+				iter.flags, &iter.iomap, NULL, delalloc);
+		if (ret)
+			goto out;
+
+		WARN_ON_ONCE(iter.iomap.offset > iter.pos);
+		WARN_ON_ONCE(iter.iomap.length == 0);
+		WARN_ON_ONCE(iter.iomap.offset + iter.iomap.length <= iter.pos);
+
+		length = iomap_length(&iter);
+		ret = __iomap_write_begin(&iter, iter.pos, length, folio);
+		if (ret)
+			goto out;
+
+		iter.pos += length;
+		iter.len -= length;
+	} while (iter.len);
+
+out:
+	if (ret < 0) {
+		folio_unlock(folio);
+		folio_put(folio);
+
+		/*
+		 * __ext4_iomap_buffered_io_begin() may have instantiated
+		 * a few blocks outside i_size. Trim these off again. Don't
+		 * need i_size_read because we hold inode lock.
+		 */
+		if (pos + len > inode->i_size)
+			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+	}
+
+	*pagep = folio_file_page(folio, pos >> PAGE_SHIFT);
+	return ret;
+}
+
+static int ext4_iomap_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int write_mode = (int)(unsigned long)fsdata;
+	struct folio *folio = page_folio(page);
+	loff_t old_size = inode->i_size;
+	size_t written;
+
+	trace_ext4_iomap_write_end(inode, pos, len, copied);
+
+	written = __iomap_write_end(inode, pos, len, copied, folio) ?
+		  copied : 0;
+
+	/*
+	 * Update the in-memory inode size after copying the data into
+	 * the page cache. It's important to update i_size while still
+	 * holding folio lock, because folio writeout could otherwise
+	 * come in and zero beyond i_size.
+	 */
+	if (pos + written > old_size)
+		i_size_write(inode, pos + written);
+
+	folio_unlock(folio);
+	folio_put(folio);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+
+	/*
+	 * For delalloc, if we have pre-allocated more blocks and copied
+	 * less, we will have delalloc extents allocated outside i_size,
+	 * drop pre-allocated blocks that were not used, prevent the
+	 * write back path from allocating blocks for them.
+	 */
+	if (unlikely(!written) && write_mode != FALL_BACK_TO_NONDELALLOC)
+		ext4_truncate_failed_write(inode);
+
+	return written;
+}
+
 /*
  * For data=journal mode, folio should be marked dirty only when it was
  * writeably mapped. When that happens, it was already attached to the
@@ -4048,6 +4170,8 @@ static const struct address_space_operations ext4_iomap_aops = {
 	.read_folio		= ext4_iomap_read_folio,
 	.readahead		= ext4_iomap_readahead,
 	.writepages		= ext4_iomap_writepages,
+	.write_begin		= ext4_iomap_write_begin,
+	.write_end		= ext4_iomap_write_end,
 	.dirty_folio		= iomap_dirty_folio,
 	.bmap			= ext4_bmap,
 	.invalidate_folio	= iomap_invalidate_folio,
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 588991b57c12..d500568daeb1 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -389,6 +389,13 @@ DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
 	TP_ARGS(inode, pos, len)
 );
 
+DEFINE_EVENT(ext4__write_begin, ext4_iomap_write_begin,
+
+	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),
+
+	TP_ARGS(inode, pos, len)
+);
+
 DECLARE_EVENT_CLASS(ext4__write_end,
 	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 		 unsigned int copied),
@@ -441,6 +448,14 @@ DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
 	TP_ARGS(inode, pos, len, copied)
 );
 
+DEFINE_EVENT(ext4__write_end, ext4_iomap_write_end,
+
+	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+		 unsigned int copied),
+
+	TP_ARGS(inode, pos, len, copied)
+);
+
 TRACE_EVENT(ext4_writepages,
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc),