From: Yu Kuai <yukuai3@huawei.com>
maillist inclusion
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAKQB5
CVE: CVE-2024-43855
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?...
--------------------------------
For flush requests, md has special handling that merges concurrent flush requests into a single one. However, the whole mechanism is based on a disk-level spinlock, 'mddev->lock', and fsync can be called quite often in some use cases; as a consequence, taking this spinlock in the IO fast path can cause performance degradation.
Fortunately, the block layer already has flush handling that merges concurrent flush requests, and it only acquires a hctx-level spinlock (see blk-flush.c for details).
This patch removes the flush handling in md and converts md to use the general block layer flush handling in the underlying disks.
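Put differently, md no longer serializes flushes on 'mddev->lock'; the converted md_flush_request() just sends one preflush bio to every active member disk and chains it to the original bio, leaving the merging to the block layer. A simplified sketch of the new flow (the rcu unlock/relock dance around submit_bio() is omitted here; see the diff below for the real code):

	struct md_rdev *rdev;
	struct bio *new;

	rdev_for_each_rcu(rdev, mddev) {
		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		/* one preflush bio per active rdev, completion chained
		 * to the original bio */
		new = bio_alloc_mddev(GFP_NOIO, 0, mddev);
		bio_set_dev(new, rdev->bdev);
		new->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
		bio_chain(new, bio);
		submit_bio(new);
	}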
Flush test for a raid10 array over 4 nvme disks: start 128 threads that each call fsync 100000 times, on arm64, and measure how long it takes.
Test script:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/time.h>

/* 128 threads, 100000 fsyncs each, as described above */
#define THREADS		128
#define FSYNC_COUNT	100000

void *thread_func(void *arg)
{
	int fd = *(int *)arg;

	for (int i = 0; i < FSYNC_COUNT; i++)
		fsync(fd);

	return NULL;
}

int main(void)
{
	int fd = open("/dev/md0", O_RDWR);

	if (fd < 0) {
		perror("open");
		exit(1);
	}

	pthread_t threads[THREADS];
	struct timeval start, end;

	gettimeofday(&start, NULL);

	for (int i = 0; i < THREADS; i++)
		pthread_create(&threads[i], NULL, thread_func, &fd);
	for (int i = 0; i < THREADS; i++)
		pthread_join(threads[i], NULL);

	gettimeofday(&end, NULL);
	close(fd);

	long long elapsed = (end.tv_sec - start.tv_sec) * 1000000LL +
			    (end.tv_usec - start.tv_usec);
	printf("Elapsed time: %lld microseconds\n", elapsed);

	return 0;
}
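The test above builds as an ordinary pthread program, for example (the file name and compiler flags here are just an illustration, assuming the source is saved as fsync_test.c):

	gcc -O2 -pthread fsync_test.c -o fsync_test
	./fsync_test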
Test result: about 10 times faster.

Before this patch: 50943374 microseconds
After this patch:   5096347 microseconds
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20240827110616.3860190-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
Conflicts:
	drivers/md/md.c
[ There are a lot of conflicts; we just delete the same functions as the
  original patch does. In the new function, the differences from mainline
  are:
  1. remove the check of 'mddev->active_io'.
  2. use rdev_for_each_rcu() and add the rcu lock.
  3. change the way 'new' is allocated. ]
Signed-off-by: Li Nan <linan122@huawei.com>
---
 drivers/md/md.h |  10 ----
 drivers/md/md.c | 127 +++++++-----------------------------------------
 2 files changed, 18 insertions(+), 119 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 6eba883eddd6..e13d3654aef5 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -508,16 +508,6 @@ struct mddev {
 				   */
 	struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */
 
-	/* Generic flush handling.
-	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_PREFLUSH flag).
-	 */
-	struct bio *flush_bio;
-	atomic_t flush_pending;
-	ktime_t start_flush, last_flush; /* last_flush is when the last completed
-					  * flush was started.
-					  */
-	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	mempool_t *serial_info_pool;
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0292eb1c8363..94ea0de3f06f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -536,123 +536,33 @@ void mddev_resume(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(mddev_resume);
 
-/*
- * Generic flush handling for md
- */
-
-static void md_end_flush(struct bio *bio)
-{
-	struct md_rdev *rdev = bio->bi_private;
-	struct mddev *mddev = rdev->mddev;
-
-	bio_put(bio);
-
-	rdev_dec_pending(rdev, mddev);
-
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		/* The pre-request flush has finished */
-		queue_work(md_wq, &mddev->flush_work);
-	}
-}
-
-static void md_submit_flush_data(struct work_struct *ws);
-
-static void submit_flushes(struct work_struct *ws)
+bool md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
+	struct bio *new;
 
-	mddev->start_flush = ktime_get_boottime();
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (rdev->raid_disk >= 0 &&
-		    !test_bit(Faulty, &rdev->flags)) {
-			/* Take two references, one is dropped
-			 * when request finishes, one after
-			 * we reclaim rcu_read_lock
-			 */
-			struct bio *bi;
-			atomic_inc(&rdev->nr_pending);
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bi->bi_end_io = md_end_flush;
-			bi->bi_private = rdev;
-			bio_set_dev(bi, rdev->bdev);
-			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-			atomic_inc(&mddev->flush_pending);
-			submit_bio(bi);
-			rcu_read_lock();
-			rdev_dec_pending(rdev, mddev);
-		}
-	rcu_read_unlock();
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-	struct bio *bio = mddev->flush_bio;
+	rdev_for_each_rcu(rdev, mddev) {
+		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+			continue;
 
-	/*
-	 * must reset flush_bio before calling into md_handle_request to avoid a
-	 * deadlock, because other bios passed md_handle_request suspend check
-	 * could wait for this and below md_handle_request could wait for those
-	 * bios because of suspend check
-	 */
-	spin_lock_irq(&mddev->lock);
-	mddev->last_flush = mddev->start_flush;
-	mddev->flush_bio = NULL;
-	spin_unlock_irq(&mddev->lock);
-	wake_up(&mddev->sb_wait);
+		rcu_read_unlock();
+		new = bio_alloc_mddev(GFP_NOIO, 0, mddev);
+		bio_set_dev(new, rdev->bdev);
+		new->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+		bio_chain(new, bio);
+		submit_bio(new);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
 
-	if (bio->bi_iter.bi_size == 0) {
-		/* an empty barrier - all done */
+	if (bio_sectors(bio) == 0) {
 		bio_endio(bio);
-	} else {
-		bio->bi_opf &= ~REQ_PREFLUSH;
-		md_handle_request(mddev, bio);
+		return true;
 	}
-}
 
-/*
- * Manages consolidation of flushes and submitting any flushes needed for
- * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
- * being finished in another context.  Returns false if the flushing is
- * complete but still needs the I/O portion of the bio to be processed.
- */
-bool md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-	ktime_t start = ktime_get_boottime();
-	spin_lock_irq(&mddev->lock);
-	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->flush_bio ||
-			    ktime_after(mddev->last_flush, start),
-			    mddev->lock);
-	if (!ktime_after(mddev->last_flush, start)) {
-		WARN_ON(mddev->flush_bio);
-		mddev->flush_bio = bio;
-		bio = NULL;
-	}
-	spin_unlock_irq(&mddev->lock);
-
-	if (!bio) {
-		INIT_WORK(&mddev->flush_work, submit_flushes);
-		queue_work(md_wq, &mddev->flush_work);
-	} else {
-		/* flush was performed for some other bio while we waited. */
-		if (bio->bi_iter.bi_size == 0)
-			/* an empty barrier - all done */
-			bio_endio(bio);
-		else {
-			bio->bi_opf &= ~REQ_PREFLUSH;
-			return false;
-		}
-	}
-	return true;
+	bio->bi_opf &= ~REQ_PREFLUSH;
+	return false;
 }
 EXPORT_SYMBOL(md_flush_request);
 
@@ -701,7 +611,6 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->active_io, 0);
 	atomic_set(&mddev->sync_seq, 0);
 	spin_lock_init(&mddev->lock);
-	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;