hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I8TKTW
CVE: NA
------------------------------------------
commit 9dc55f1389f9 ("iomap: add support for sub-pagesize buffered I/O without buffer heads") replaced the per-block structure buffer_head with the per-page structure iomap_page. However, iomap_page can't track the dirty state of sub-page blocks, which causes a performance issue: sub-page blocks are written back even if they are not dirty.
For example, if block size is 4k and page size is 64k:
dd if=/dev/zero of=testfile bs=4k count=16 oflag=sync
With the buffer_head implementation, the above dd command writes back 4k in each round. With the iomap_page implementation, however, each round writes back everything from the start of the page up to the end offset just written.
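Concretely (assuming the 64k page starts clean and is written sequentially in 4k chunks): round i writes back i * 4k bytes, i.e. 4k + 8k + ... + 64k = 544k in total for only 64k of new data, whereas per-block dirty tracking writes back just the sixteen dirty 4k blocks (64k in total).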
Thus add support to track dirty state in iomap_page.
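As an illustration of the layout, here is a minimal userspace sketch (not kernel code): DIRTY_BITS() and IOMAP_STATE_ARRAY_SIZE are copied from the patch below, while the PAGE_SIZE, SECTOR_SIZE and BLOCK_SIZE values are assumptions mirroring the test environment.

/*
 * Minimal userspace sketch (not kernel code): shows how a block's
 * uptodate bit and dirty bit land in the two halves of iop->state.
 * DIRTY_BITS() and IOMAP_STATE_ARRAY_SIZE are copied from the patch;
 * the size values below are assumptions mirroring the test setup.
 */
#include <stdio.h>

#define PAGE_SIZE	65536			/* 64k page, as in the test */
#define SECTOR_SIZE	512
#define BLOCK_SIZE	4096			/* 4k filesystem block */

#define DIRTY_BITS(x)		((x) + PAGE_SIZE / SECTOR_SIZE)
#define IOMAP_STATE_ARRAY_SIZE	(PAGE_SIZE * 2 / SECTOR_SIZE)

int main(void)
{
	unsigned int i;

	printf("state[] has %d bits: uptodate 0..%d, dirty %d..%d\n",
	       IOMAP_STATE_ARRAY_SIZE, PAGE_SIZE / SECTOR_SIZE - 1,
	       DIRTY_BITS(0), DIRTY_BITS(PAGE_SIZE / SECTOR_SIZE - 1));

	/* bit positions used for each 4k block of a 64k page */
	for (i = 0; i < PAGE_SIZE / BLOCK_SIZE; i++)
		printf("block %2u: uptodate bit %3u, dirty bit %3u\n",
		       i, i, DIRTY_BITS(i));
	return 0;
}

Keeping both halves in one bitmap means a single allocation and a single lock, which is why uptodate_lock becomes state_lock in the patch.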
test environment:
platform: arm64
pagesize: 64k
blocksize: 4k
test case:
dd if=/dev/zero of=/mnt/testfile bs=1M count=128
fio --ioengine=sync --rw=randwrite --iodepth=64 --name=test \
    --filename=/mnt/testfile --bs=4k --fsync=1
The test result is:
xfs with patch:    WRITE: bw=4609KiB/s (4720kB/s)
xfs without patch: WRITE: bw=2714KiB/s (2780kB/s)
ext4:              WRITE: bw=3840KiB/s (3932kB/s)
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 fs/iomap.c            | 70 ++++++++++++++++++++++++++++++++++++-------
 fs/xfs/xfs_aops.c     |  3 +-
 include/linux/iomap.h | 15 ++++++++--
 3 files changed, 74 insertions(+), 14 deletions(-)
diff --git a/fs/iomap.c b/fs/iomap.c
index 7078add7bbf9..d7234c7ac95e 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -121,8 +121,8 @@ iomap_page_create(struct inode *inode, struct page *page)
 	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
 	atomic_set(&iop->read_count, 0);
 	atomic_set(&iop->write_count, 0);
-	spin_lock_init(&iop->uptodate_lock);
-	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+	spin_lock_init(&iop->state_lock);
+	bitmap_zero(iop->state, IOMAP_STATE_ARRAY_SIZE);
 
 	/*
 	 * migrate_page_move_mapping() assumes that pages with private data have
@@ -175,7 +175,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 
 		/* move forward for each leading block marked uptodate */
 		for (i = first; i <= last; i++) {
-			if (!test_bit(i, iop->uptodate))
+			if (!test_bit(i, iop->state))
 				break;
 			*pos += block_size;
 			poff += block_size;
@@ -185,7 +185,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 
 		/* truncate len if we find any trailing uptodate block(s) */
 		for ( ; i <= last; i++) {
-			if (test_bit(i, iop->uptodate)) {
+			if (test_bit(i, iop->state)) {
 				plen -= (last - i + 1) * block_size;
 				last = i - 1;
 				break;
@@ -209,6 +209,54 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 	*lenp = plen;
 }
 
+static void
+iomap_set_range_dirty(struct page *page, unsigned int off,
+		unsigned int len)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned int first = DIRTY_BITS(off >> inode->i_blkbits);
+	unsigned int last = DIRTY_BITS((off + len - 1) >> inode->i_blkbits);
+	unsigned long flags;
+	struct iomap_page *iop;
+
+	if (PageError(page))
+		return;
+
+	if (len)
+		iomap_set_page_dirty(page);
+
+	if (!page_has_private(page))
+		return;
+
+	iop = to_iomap_page(page);
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_set(iop->state, first, last - first + 1);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+void
+iomap_clear_range_dirty(struct page *page, unsigned int off,
+		unsigned int len)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned int first = DIRTY_BITS(off >> inode->i_blkbits);
+	unsigned int last = DIRTY_BITS((off + len - 1) >> inode->i_blkbits);
+	unsigned long flags;
+	struct iomap_page *iop;
+
+	if (PageError(page))
+		return;
+
+	if (!page_has_private(page))
+		return;
+
+	iop = to_iomap_page(page);
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_clear(iop->state, first, last - first + 1);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+EXPORT_SYMBOL_GPL(iomap_clear_range_dirty);
+
 static void
 iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
 {
@@ -220,17 +268,17 @@ iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
 	unsigned long flags;
 	unsigned int i;
 
-	spin_lock_irqsave(&iop->uptodate_lock, flags);
+	spin_lock_irqsave(&iop->state_lock, flags);
 	for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
 		if (i >= first && i <= last)
-			set_bit(i, iop->uptodate);
-		else if (!test_bit(i, iop->uptodate))
+			set_bit(i, iop->state);
+		else if (!test_bit(i, iop->state))
 			uptodate = false;
 	}
 
 	if (uptodate)
 		SetPageUptodate(page);
-	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
 }
 
 static void
@@ -544,7 +592,7 @@ iomap_is_partially_uptodate(struct page *page, unsigned long from,
 
 	if (iop) {
 		for (i = first; i <= last; i++)
-			if (!test_bit(i, iop->uptodate))
+			if (!test_bit(i, iop->state))
 				return 0;
 		return 1;
 	}
@@ -760,7 +808,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 	if (unlikely(copied < len && !PageUptodate(page)))
 		return 0;
 	iomap_set_range_uptodate(page, offset_in_page(pos), len);
-	iomap_set_page_dirty(page);
+	iomap_set_range_dirty(page, offset_in_page(pos), len);
 	return copied;
 }
 
@@ -1096,7 +1144,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 	} else {
 		WARN_ON_ONCE(!PageUptodate(page));
 		iomap_page_create(inode, page);
-		set_page_dirty(page);
+		iomap_set_range_dirty(page, offset_in_page(pos), length);
 	}
 
 	return length;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7a6396bd8114..554c74726552 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -751,7 +751,7 @@ xfs_writepage_map(
 	for (i = 0, file_offset = page_offset(page);
 	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
 	     i++, file_offset += len) {
-		if (iop && !test_bit(i, iop->uptodate))
+		if (iop && !test_bit(DIRTY_BITS(i), iop->state))
 			continue;
 
 		error = xfs_map_blocks(wpc, inode, file_offset);
@@ -800,6 +800,7 @@ xfs_writepage_map(
 		 */
 		set_page_writeback_keepwrite(page);
 	} else {
+		iomap_clear_range_dirty(page, 0, PAGE_SIZE);
 		clear_page_dirty_for_io(page);
 		set_page_writeback(page);
 	}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index eed29ae891b1..4ce1290ac0c3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -7,6 +7,7 @@
 #include <linux/bitmap.h>
 #include <linux/mm.h>
 #include <linux/types.h>
+#include <linux/blkdev.h>
 
 struct address_space;
 struct fiemap_extent_info;
@@ -53,6 +54,9 @@ struct vm_fault;
  */
 #define IOMAP_NULL_ADDR -1ULL	/* addr is not valid */
 
+#define DIRTY_BITS(x)		((x) + PAGE_SIZE / SECTOR_SIZE)
+#define IOMAP_STATE_ARRAY_SIZE	(PAGE_SIZE * 2 / SECTOR_SIZE)
+
 struct iomap {
 	u64			addr; /* disk offset of mapping, bytes */
 	loff_t			offset;	/* file offset of mapping, bytes */
@@ -114,8 +118,12 @@ struct iomap_ops {
 struct iomap_page {
 	atomic_t		read_count;
 	atomic_t		write_count;
-	spinlock_t		uptodate_lock;
-	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+	spinlock_t		state_lock;
+	/*
+	 * The first half bits are used to track sub-page uptodate status,
+	 * the second half bits are for dirty status.
+	 */
+	DECLARE_BITMAP(state, IOMAP_STATE_ARRAY_SIZE);
 };
 
 static inline struct iomap_page *to_iomap_page(struct page *page)
@@ -136,6 +144,9 @@ int iomap_is_partially_uptodate(struct page *page, unsigned long from,
 int iomap_releasepage(struct page *page, gfp_t gfp_mask);
 void iomap_invalidatepage(struct page *page, unsigned int offset,
 		unsigned int len);
+void iomap_clear_range_dirty(struct page *page, unsigned int off,
+		unsigned int len);
+
 #ifdef CONFIG_MIGRATION
 int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
 		struct page *page, enum migrate_mode mode);