Li Nan (1): md: ensure child flush IO does not affect origin bio->bi_status
Yu Kuai (1): md: Remove flush handling
drivers/md/md.h | 10 ---- drivers/md/md.c | 134 +++++++++++------------------------------------- 2 files changed, 31 insertions(+), 113 deletions(-)
From: Yu Kuai yukuai3@huawei.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAKQB5 CVE: CVE-2024-43855
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?...
--------------------------------
For flush requests, md has special flush handling that merges concurrent flush requests into a single one; however, the whole mechanism is based on a disk-level spin_lock, 'mddev->lock'. fsync can be called quite often in some use cases, and as a consequence, taking the spin lock in the IO fast path can cause performance degradation.
Fortunately, the block layer already has flush handling to merge concurrent flush requests, and it only acquires an hctx-level spin lock. (See details in blk-flush.c.)
This patch removes the flush handling in md, and converts to use general block layer flush handling in underlying disks.
Flush test for a raid10 array of 4 nvme disks: start 128 threads to do fsync 100000 times on arm64, and see how long it takes.
Test script: void* thread_func(void* arg) { int fd = *(int*)arg; for (int i = 0; i < FSYNC_COUNT; i++) { fsync(fd); } return NULL; }
int main() { int fd = open("/dev/md0", O_RDWR); if (fd < 0) { perror("open"); exit(1); }
pthread_t threads[THREADS]; struct timeval start, end;
gettimeofday(&start, NULL);
for (int i = 0; i < THREADS; i++) { pthread_create(&threads[i], NULL, thread_func, &fd); }
for (int i = 0; i < THREADS; i++) { pthread_join(threads[i], NULL); }
gettimeofday(&end, NULL);
close(fd);
long long elapsed = (end.tv_sec - start.tv_sec) * 1000000LL + (end.tv_usec - start.tv_usec); printf("Elapsed time: %lld microseconds\n", elapsed);
return 0; }
Test result: about 10 times faster: Before this patch: 50943374 microseconds After this patch: 5096347 microseconds
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Yu Kuai yukuai3@huawei.com Link: https://lore.kernel.org/r/20240827110616.3860190-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu song@kernel.org Conflicts: drivers/md/md.c [ There are a lot of conflicts, so we just delete the same functions as the original patch. In the new function, the differences from mainline are: 1. remove the check of 'mddev->active_io'. 2. use rdev_for_each_rcu() and add rcu lock. 3. change the way 'new' is allocated. ] Signed-off-by: Li Nan linan122@huawei.com --- drivers/md/md.h | 10 ---- drivers/md/md.c | 127 +++++++----------------------------------------- 2 files changed, 18 insertions(+), 119 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h index 6eba883eddd6..e13d3654aef5 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -508,16 +508,6 @@ struct mddev { */ struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */
- /* Generic flush handling. - * The last to finish preflush schedules a worker to submit - * the rest of the request (without the REQ_PREFLUSH flag). - */ - struct bio *flush_bio; - atomic_t flush_pending; - ktime_t start_flush, last_flush; /* last_flush is when the last completed - * flush was started. - */ - struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 897bb0aad180..8f75b28d49e9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -536,123 +536,33 @@ void mddev_resume(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_resume);
-/* - * Generic flush handling for md - */ - -static void md_end_flush(struct bio *bio) -{ - struct md_rdev *rdev = bio->bi_private; - struct mddev *mddev = rdev->mddev; - - bio_put(bio); - - rdev_dec_pending(rdev, mddev); - - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pre-request flush has finished */ - queue_work(md_wq, &mddev->flush_work); - } -} - -static void md_submit_flush_data(struct work_struct *ws); - -static void submit_flushes(struct work_struct *ws) +bool md_flush_request(struct mddev *mddev, struct bio *bio) { - struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; + struct bio *new;
- mddev->start_flush = ktime_get_boottime(); - INIT_WORK(&mddev->flush_work, md_submit_flush_data); - atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags)) { - /* Take two references, one is dropped - * when request finishes, one after - * we reclaim rcu_read_lock - */ - struct bio *bi; - atomic_inc(&rdev->nr_pending); - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); - bi->bi_end_io = md_end_flush; - bi->bi_private = rdev; - bio_set_dev(bi, rdev->bdev); - bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - atomic_inc(&mddev->flush_pending); - submit_bio(bi); - rcu_read_lock(); - rdev_dec_pending(rdev, mddev); - } - rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) - queue_work(md_wq, &mddev->flush_work); -} - -static void md_submit_flush_data(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, flush_work); - struct bio *bio = mddev->flush_bio; + rdev_for_each_rcu(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue;
- /* - * must reset flush_bio before calling into md_handle_request to avoid a - * deadlock, because other bios passed md_handle_request suspend check - * could wait for this and below md_handle_request could wait for those - * bios because of suspend check - */ - spin_lock_irq(&mddev->lock); - mddev->last_flush = mddev->start_flush; - mddev->flush_bio = NULL; - spin_unlock_irq(&mddev->lock); - wake_up(&mddev->sb_wait); + rcu_read_unlock(); + new = bio_alloc_mddev(GFP_NOIO, 0, mddev); + bio_set_dev(new, rdev->bdev); + new->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio_chain(new, bio); + submit_bio(new); + rcu_read_lock(); + } + rcu_read_unlock();
- if (bio->bi_iter.bi_size == 0) { - /* an empty barrier - all done */ + if (bio_sectors(bio) == 0) { bio_endio(bio); - } else { - bio->bi_opf &= ~REQ_PREFLUSH; - md_handle_request(mddev, bio); + return true; } -}
-/* - * Manages consolidation of flushes and submitting any flushes needed for - * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is - * being finished in another context. Returns false if the flushing is - * complete but still needs the I/O portion of the bio to be processed. - */ -bool md_flush_request(struct mddev *mddev, struct bio *bio) -{ - ktime_t start = ktime_get_boottime(); - spin_lock_irq(&mddev->lock); - wait_event_lock_irq(mddev->sb_wait, - !mddev->flush_bio || - ktime_after(mddev->last_flush, start), - mddev->lock); - if (!ktime_after(mddev->last_flush, start)) { - WARN_ON(mddev->flush_bio); - mddev->flush_bio = bio; - bio = NULL; - } - spin_unlock_irq(&mddev->lock); - - if (!bio) { - INIT_WORK(&mddev->flush_work, submit_flushes); - queue_work(md_wq, &mddev->flush_work); - } else { - /* flush was performed for some other bio while we waited. */ - if (bio->bi_iter.bi_size == 0) - /* an empty barrier - all done */ - bio_endio(bio); - else { - bio->bi_opf &= ~REQ_PREFLUSH; - return false; - } - } - return true; + bio->bi_opf &= ~REQ_PREFLUSH; + return false; } EXPORT_SYMBOL(md_flush_request);
@@ -701,7 +611,6 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->active_io, 0); atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); - atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector;
hulk inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAKQB5 CVE: CVE-2024-43855
--------------------------------
When a flush is issued to a RAID array, a child flush IO is created and issued for each member disk in the RAID array. Since patch b75197e86e6d ("md: Remove flush handling"), each child flush IO has been chained with the original bio. As a result, the failure of any child IO could modify the bi_status of the original bio, potentially impacting the upper-layer filesystem.
Fix the issue by preventing child flush IO from altering the original bio->bi_status as before. However, this design introduces a known issue: in the event of a power failure, if a flush IO on a member disk fails, the upper layers may not be informed. This issue will be fixed in a future patch.
Fixes: b75197e86e6d ("md: Remove flush handling") Signed-off-by: Li Nan linan122@huawei.com --- drivers/md/md.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c index 8f75b28d49e9..a4d03d7f571a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -536,6 +536,23 @@ void mddev_resume(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_resume);
+static void md_end_flush(struct bio *bio) +{ + struct bio *parent = bio->bi_private; + char b[BDEVNAME_SIZE]; + + /* + * If any flush io error before the power failure, + * disk data may be lost. + */ + if (bio->bi_status) + pr_err("md: %s flush io error %d\n", bio_devname(bio, b), + blk_status_to_errno(bio->bi_status)); + + bio_put(bio); + bio_endio(parent); +} + bool md_flush_request(struct mddev *mddev, struct bio *bio) { struct md_rdev *rdev; @@ -550,7 +567,9 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio) new = bio_alloc_mddev(GFP_NOIO, 0, mddev); bio_set_dev(new, rdev->bdev); new->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - bio_chain(new, bio); + new->bi_private = bio; + new->bi_end_io = md_end_flush; + bio_inc_remaining(bio); submit_bio(new); rcu_read_lock(); }
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/12121 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/E...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/12121 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/E...