From: Zhihao Cheng <chengzhihao1@huawei.com>
mainline inclusion
from mainline-5.19-rc1
commit 68f4c6eba70df70a720188bce95c85570ddfcc87
category: bugfix
bugzilla: 186540, https://gitee.com/openeuler/kernel/issues/I55AKK
CVE: NA
--------------------------------
Commit 505a666ee3fc ("writeback: plug writeback in wb_writeback() and writeback_inodes_wb()") has us holding a plug during wb_writeback, which may cause an ABBA deadlock:
  wb_writeback			fat_file_fsync
  blk_start_plug(&plug)
  for (;;) {
  iter i-1: some reqs have been
	    added into plug->mq_list	// LOCK A
  iter i:
     progress = __writeback_inodes_wb(wb, work)
     . writeback_sb_inodes		// fat's bdev
     . __writeback_single_inode
     . . generic_writepages
     . . __block_write_full_page
     . . . .			__generic_file_fsync
     . . . .			sync_inode_metadata
     . . . .			writeback_single_inode
     . . . .			__writeback_single_inode
     . . . .			fat_write_inode
     . . . .			__fat_write_inode
     . . . .			sync_dirty_buffer	// fat's bdev
     . . . .			lock_buffer(bh)		// LOCK B
     . . . .			submit_bh
     . . . .			blk_mq_get_tag		// LOCK A
     . . . trylock_buffer(bh)	// LOCK B
     . . . redirty_page_for_writepage
     . . . wbc->pages_skipped++
     . . --wbc->nr_to_write
     . wrote += write_chunk - wbc.nr_to_write	// wrote > 0
     . requeue_inode
     . redirty_tail_locked
     if (progress)		// progress > 0
	continue;
  iter i+1:
     queue_io
     // similar process with iter i, infinite for-loop !
  }
  blk_finish_plug(&plug)	// flush plug won't be called
Above process triggers a hungtask like:
[  399.044861] INFO: task bb:2607 blocked for more than 30 seconds.
[  399.046824]       Not tainted 5.18.0-rc1-00005-gefae4d9eb6a2-dirty
[  399.051539] task:bb              state:D stack:    0 pid: 2607 ppid:  2426 flags:0x00004000
[  399.051556] Call Trace:
[  399.051570]  __schedule+0x480/0x1050
[  399.051592]  schedule+0x92/0x1a0
[  399.051602]  io_schedule+0x22/0x50
[  399.051613]  blk_mq_get_tag+0x1d3/0x3c0
[  399.051640]  __blk_mq_alloc_requests+0x21d/0x3f0
[  399.051657]  blk_mq_submit_bio+0x68d/0xca0
[  399.051674]  __submit_bio+0x1b5/0x2d0
[  399.051708]  submit_bio_noacct+0x34e/0x720
[  399.051718]  submit_bio+0x3b/0x150
[  399.051725]  submit_bh_wbc+0x161/0x230
[  399.051734]  __sync_dirty_buffer+0xd1/0x420
[  399.051744]  sync_dirty_buffer+0x17/0x20
[  399.051750]  __fat_write_inode+0x289/0x310
[  399.051766]  fat_write_inode+0x2a/0xa0
[  399.051783]  __writeback_single_inode+0x53c/0x6f0
[  399.051795]  writeback_single_inode+0x145/0x200
[  399.051803]  sync_inode_metadata+0x45/0x70
[  399.051856]  __generic_file_fsync+0xa3/0x150
[  399.051880]  fat_file_fsync+0x1d/0x80
[  399.051895]  vfs_fsync_range+0x40/0xb0
[  399.051929]  __x64_sys_fsync+0x18/0x30
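The shape of the problem, as a minimal C sketch (not the upstream code): wb_writeback() holds a single plug across all loop iterations, so requests already queued on plug->mq_list can only be dispatched by blk_finish_plug(), which is never reached while the loop keeps seeing progress.

  struct blk_plug plug;

  blk_start_plug(&plug);
  for (;;) {
  	/* skipped pages inflate "progress", so it never drops to 0 */
  	long progress = __writeback_inodes_wb(wb, work);

  	if (progress)
  		continue;
  	break;
  }
  blk_finish_plug(&plug);	/* never reached: plugged reqs never issued */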
In my test, 'need_resched()' (which was introduced by 590dca3a71 "fs-writeback: unplug before cond_resched in writeback_sb_inodes") in function 'writeback_sb_inodes()' seldom evaluates to true, unless cond_resched() is deleted from write_cache_pages().
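For context, the escape hatch that commit added inside writeback_sb_inodes() looks roughly like this (a sketch of that earlier commit for reference; it is not part of this diff):

  if (need_resched()) {
  	/* cond_resched() does not unplug, so issue queued IO first */
  	blk_flush_plug(current);
  	cond_resched();
  }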
Fix it by correcting the 'wrote' number according to the number of skipped pages in writeback_sb_inodes().
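In plain C, the corrected accounting amounts to the following (a paraphrase of the hunk below, no extra logic):

  /* Skipped (redirtied) pages no longer count as progress, and the
   * per-inode result is clamped at zero. */
  wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
  wrote = wrote < 0 ? 0 : wrote;
  total_wrote += wrote;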
See the Link below for a reproducer.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215837
Cc: stable@vger.kernel.org # v4.3
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220510133805.1988292-1-chengzhihao1@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 fs/fs-writeback.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5abb71da2b9a..23a632f02839 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1568,11 +1568,12 @@ static long writeback_sb_inodes(struct super_block *sb,
 	};
 	unsigned long start_time = jiffies;
 	long write_chunk;
-	long wrote = 0;  /* count both pages and inodes */
+	long total_wrote = 0;  /* count both pages and inodes */

 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct bdi_writeback *tmp_wb;
+		long wrote;

 		if (inode->i_sb != sb) {
 			if (work->sb) {
@@ -1648,7 +1649,9 @@ static long writeback_sb_inodes(struct super_block *sb,

 		wbc_detach_inode(&wbc);
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
-		wrote += write_chunk - wbc.nr_to_write;
+		wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
+		wrote = wrote < 0 ? 0 : wrote;
+		total_wrote += wrote;

 		if (need_resched()) {
 			/*
@@ -1670,7 +1673,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 			tmp_wb = inode_to_wb_and_lock_list(inode);
 			spin_lock(&inode->i_lock);
 			if (!(inode->i_state & I_DIRTY_ALL))
-				wrote++;
+				total_wrote++;
 			requeue_inode(inode, tmp_wb, &wbc);
 			inode_sync_complete(inode);
 			spin_unlock(&inode->i_lock);
@@ -1684,14 +1687,14 @@ static long writeback_sb_inodes(struct super_block *sb,
 		 * bail out to wb_writeback() often enough to check
 		 * background threshold and other termination conditions.
 		 */
-		if (wrote) {
+		if (total_wrote) {
 			if (time_is_before_jiffies(start_time + HZ / 10UL))
 				break;
 			if (work->nr_pages <= 0)
 				break;
 		}
 	}
-	return wrote;
+	return total_wrote;
 }

 static long __writeback_inodes_wb(struct bdi_writeback *wb,
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
mainline inclusion
from mainline-v5.7-rc1
commit 2b8bd423614c595540eaadcfbc702afe8e155e50
category: bugfix
bugzilla: 187044, https://gitee.com/openeuler/kernel/issues/I5F2BY
CVE: NA
--------------------------------
Currently io_ticks is approximated by adding one at each start and end of a request if the jiffies counter has changed. This works perfectly for requests shorter than a jiffy, or if some request starts or ends in every jiffy.
If the disk executes just one request at a time and each request is longer than two jiffies, then only the first and the last jiffies will be accounted.
The fix is simple: at the end of a request, add to io_ticks the number of jiffies that passed since the last update, rather than just one jiffy.
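The change boils down to the following shape (a sketch of the hunk applied below; the new 'end' flag distinguishes request completion from request start):

  /* On request end, credit all jiffies elapsed since the last stamp
   * update; on start, keep adding a single jiffy as before. */
  stamp = READ_ONCE(part->stamp);
  if (unlikely(stamp != now)) {
  	if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
  		__part_stat_add(cpu, part, io_ticks, end ? now - stamp : 1);
  }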
Example: a common HDD executes random 4k read requests in around 12ms each.
fio --name=test --filename=/dev/sdb --rw=randread --direct=1 --runtime=30 &
iostat -x 10 sdb
Note the change in iostat's "%util" from 8,43% to 99,99% before/after the patch:
Before:
Device:  rrqm/s  wrqm/s    r/s    w/s   rkB/s   wkB/s avgrq-sz avgqu-sz  await r_await w_await  svctm  %util
sdb        0,00    0,00  82,60   0,00  330,40    0,00     8,00     0,96  12,09   12,09    0,00   1,02   8,43
After:
Device:  rrqm/s  wrqm/s    r/s    w/s   rkB/s   wkB/s avgrq-sz avgqu-sz  await r_await w_await  svctm  %util
sdb        0,00    0,00  82,50   0,00  330,00    0,00     8,00     1,00  12,10   12,10    0,00  12,12  99,99
Now io_ticks does not lose time between the start and end of requests, but for queue-depth > 1 some I/O time between adjacent starts might still be lost.
For load estimation, "%util" is not as useful as the average queue length, but it clearly shows how often the disk queue is completely empty.
Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflict:
	block/bio.c
	block/blk-core.c
	include/linux/genhd.h
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/bio.c           | 6 +++---
 block/blk-core.c      | 2 +-
 include/linux/genhd.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 6457cbfa70cc..b5bbc023d64d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1668,14 +1668,14 @@ void bio_check_pages_dirty(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_check_pages_dirty);

-void update_io_ticks(int cpu, struct hd_struct *part, unsigned long now)
+void update_io_ticks(int cpu, struct hd_struct *part, unsigned long now, bool end)
 {
 	unsigned long stamp;
 again:
 	stamp = READ_ONCE(part->stamp);
 	if (unlikely(stamp != now)) {
 		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
-			__part_stat_add(cpu, part, io_ticks, now - stamp);
+			__part_stat_add(cpu, part, io_ticks, end ? now - stamp : 1);
 	}
 	if (part->partno) {
 		part = &part_to_disk(part)->part0;
@@ -1709,7 +1709,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
 	if (precise_iostat) {
 		part_round_stats(q, cpu, part);
 	} else {
-		update_io_ticks(cpu, part, now);
+		update_io_ticks(cpu, part, now, true);
 		part_stat_add(cpu, part, time_in_queue, duration);
 	}
 	part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration));
diff --git a/block/blk-core.c b/block/blk-core.c
index a5d80ab91170..5892c532ae5b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2809,7 +2809,7 @@ void blk_account_io_done(struct request *req, u64 now)
 	part = req->part;

 	if (!precise_iostat) {
-		update_io_ticks(cpu, part, jiffies);
+		update_io_ticks(cpu, part, jiffies, true);
 		part_stat_add(cpu, part, time_in_queue,
 			      nsecs_to_jiffies64(now - req->start_time_ns));
 	} else {
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 264326ae5a90..58a819484fb4 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -427,7 +427,7 @@ static inline void free_part_info(struct hd_struct *part)

 /* block/blk-core.c */
 extern void part_round_stats(struct request_queue *q, int cpu,
 			     struct hd_struct *part);
-void update_io_ticks(int cpu, struct hd_struct *part, unsigned long now);
+void update_io_ticks(int cpu, struct hd_struct *part, unsigned long now, bool end);

 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk);
From: Zhang Wensheng <zhangwensheng5@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 187044, https://gitee.com/openeuler/kernel/issues/I5F2BY
CVE: NA
--------------------------------
There is an io path like:

blk_mq_make_request
  blk_mq_bio_to_request
    blk_account_io_start
      part_round_stats
        part_in_flight /* controlled by precise_iostat; it can also be
                          computed by atomic accumulation */
          blk_mq_in_flight
            blk_mq_queue_tag_busy_iter
              bt_for_each
                blk_mq_find_and_get_req
                  blk_mq_tags_lock_irqsave
                  ...
                  blk_mq_tags_unlock_irqrestore

As we can see, there is an unnecessary locking operation in the io path, which hurts concurrency performance. The problem was introduced by part_round_stats() and part_in_flight(), which use the tags to account for inflight requests.
Fix it by gating part_round_stats() behind "precise_iostat" as well: when "precise_iostat" is on, iostat is accurate and inflight requests are accounted by atomic accumulation; when it is off, iostat will not be accurate, but the tag-iterating path is avoided.
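The resulting accounting choice, as a sketch of the hunks below:

  /* Only take the tag-iterating path (part_round_stats() ->
   * blk_mq_in_flight()) when precise_iostat is set; otherwise fall back
   * to the lock-free, jiffies-based update_io_ticks(). */
  if (precise_iostat)
  	part_round_stats(q, cpu, part);
  else
  	update_io_ticks(cpu, part, jiffies, false);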
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/bio.c      | 5 ++++-
 block/blk-core.c | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index b5bbc023d64d..48092fe0c116 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1689,7 +1689,10 @@ void generic_start_io_acct(struct request_queue *q, int op,
 	const int sgrp = op_stat_group(op);
 	int cpu = part_stat_lock();

-	part_round_stats(q, cpu, part);
+	if (precise_iostat)
+		part_round_stats(q, cpu, part);
+	else
+		update_io_ticks(cpu, part, jiffies, false);
 	part_stat_inc(cpu, part, ios[sgrp]);
 	part_stat_add(cpu, part, sectors[sgrp], sectors);
 	part_inc_in_flight(q, part, op_is_write(op));
diff --git a/block/blk-core.c b/block/blk-core.c
index 5892c532ae5b..219aee53a1be 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2864,7 +2864,10 @@ void blk_account_io_start(struct request *rq, bool new_io)
 		part_stat_inc(cpu, part, merges[rw]);
 	} else {
 		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
-		part_round_stats(rq->q, cpu, part);
+		if (!precise_iostat)
+			update_io_ticks(cpu, part, jiffies, false);
+		else
+			part_round_stats(rq->q, cpu, part);
 		part_inc_in_flight(rq->q, part, rw);
 		rq->part = part;
 	}
From: Zhang Wensheng <zhangwensheng5@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 187044, https://gitee.com/openeuler/kernel/issues/I5F2BY
CVE: NA
--------------------------------
Enable accurate iostat accounting by default by setting "precise_iostat" to true at initialization.
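For reference, a minimal sketch of the resulting default together with its boot-time override. Only the initializer changes in this patch; the body of precise_iostat_setup() and the __setup() registration string "precise_iostat=" are assumptions, as they are not shown in the hunk below:

  bool precise_iostat = true;	/* accurate accounting now on by default */

  static int __init precise_iostat_setup(char *str)
  {
  	bool precise;

  	/* assumed parsing: accept "0"/"1"/"on"/"off" etc. */
  	if (kstrtobool(str, &precise))
  		return -EINVAL;

  	precise_iostat = precise;
  	return 1;
  }
  __setup("precise_iostat=", precise_iostat_setup);	/* assumed */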
Signed-off-by: Zhang Wensheng <zhangwensheng5@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 219aee53a1be..bced58322fcc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -56,7 +56,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);

 DEFINE_IDA(blk_queue_ida);

-bool precise_iostat;
+bool precise_iostat = true;
 static int __init precise_iostat_setup(char *str)
 {
 	bool precise;