From: Jan Kara <jack@suse.cz>
mainline inclusion
from mainline-v5.12-rc2
commit b6e68ee82585f2ee890b0a897a6aacbf49a467bb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7QFJD?from=project-issue
CVE: NA
--------------------------------
Currently when a non-mq aware IO scheduler (BFQ, mq-deadline) is used for a queue with multiple HW queues, the performance is rather bad. The problem is that these IO schedulers use queue-wide locking and their dispatch function does not respect the hctx it is passed in and returns any request it finds appropriate. Thus locality of request access is broken and dispatch from multiple CPUs just contends on IO scheduler locks. For these IO schedulers there's little point in dispatching from multiple CPUs. Instead dispatch always only from a single CPU to limit contention.
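In short, with the patch blk_mq_run_hw_queues() runs only the hctx mapped to the current CPU whenever such a scheduler is active, unless some other hctx has requests that bypass the scheduler. Condensed here from the diff below for readability (blk_mq_delay_run_hw_queues() gets the same treatment):

void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx, *sq_hctx = NULL;
	int i;

	/* Non-mq aware scheduler? Prefer the hctx of the current CPU. */
	if (blk_mq_has_sqsched(q))
		sq_hctx = blk_mq_get_sq_hctx(q);
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_hctx_stopped(hctx))
			continue;
		/*
		 * Run only the preferred hctx, or any hctx holding
		 * requests that bypass the scheduler on its ->dispatch
		 * list.
		 */
		if (!sq_hctx || sq_hctx == hctx ||
		    !list_empty_careful(&hctx->dispatch))
			blk_mq_run_hw_queue(hctx, async);
	}
}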
Below is a comparison of dbench runs on an XFS filesystem where the storage is a RAID card advertising 64 HW queues with a single rotating disk attached to it. BFQ is used as the IO scheduler:
clients          MQ                     SQ               MQ-Patched
Amean 1       39.12 (  0.00%)       43.29 * -10.67%*       36.09 *   7.74%*
Amean 2      128.58 (  0.00%)      101.30 *  21.22%*       96.14 *  25.23%*
Amean 4      577.42 (  0.00%)      494.47 *  14.37%*      508.49 *  11.94%*
Amean 8      610.95 (  0.00%)      363.86 *  40.44%*      362.12 *  40.73%*
Amean 16     391.78 (  0.00%)      261.49 *  33.25%*      282.94 *  27.78%*
Amean 32     324.64 (  0.00%)      267.71 *  17.54%*      233.00 *  28.23%*
Amean 64     295.04 (  0.00%)      253.02 *  14.24%*      242.37 *  17.85%*
Amean 512  10281.61 (  0.00%)    10211.16 *   0.69%*    10447.53 *  -1.61%*
Numbers are times, so lower is better. MQ is the stock 5.10-rc6 kernel. SQ is the same kernel with megaraid_sas.host_tagset_enable=0 so that the card advertises just a single HW queue. MQ-Patched is the MQ kernel with this patch applied.
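For reference, the SQ configuration above amounts to booting the stock kernel with the module parameter below on the kernel command line (or passing the equivalent option to modprobe if megaraid_sas is built as a module):

megaraid_sas.host_tagset_enable=0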
You can see that multiple hardware queues heavily hurt performance in combination with BFQ. The patch restores the performance.
conflicts:
	block/blk-mq.c
	include/linux/elevator.h
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Lei Chen <lei.chen@smartx.com>
---
 block/blk-mq.c           | 63 +++++++++++++++++++++++++++++++++++++---
 block/kyber-iosched.c    |  1 +
 include/linux/elevator.h |  3 ++
 3 files changed, 63 insertions(+), 4 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 96debbe63ad2..2b0f0a9f40d9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1521,17 +1521,62 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	return false;
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queue);
 
+/*
+ * Is the request queue handled by an IO scheduler that does not respect
+ * hardware queues when dispatching?
+ */
+static bool blk_mq_has_sqsched(struct request_queue *q)
+{
+	struct elevator_queue *e = q->elevator;
-void blk_mq_run_hw_queues(struct request_queue *q, bool async)
+	if (e && e->type->ops.mq.dispatch_request &&
+	    !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
+		return true;
+	return false;
+}
+
+/*
+ * Return preferred queue to dispatch from (if any) for non-mq aware IO
+ * scheduler.
+ */
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
+
+	/*
+	 * If the IO scheduler does not respect hardware queues when
+	 * dispatching, we just don't bother with multiple HW queues and
+	 * dispatch from hctx for the current CPU since running multiple queues
+	 * just causes lock contention inside the scheduler and pointless cache
+	 * bouncing.
+	 */
+	hctx = blk_mq_map_queue(q, raw_smp_processor_id());
+	if (!blk_mq_hctx_stopped(hctx))
+		return hctx;
+	return NULL;
+}
+
+
+void blk_mq_run_hw_queues(struct request_queue *q, bool async)
+{
+	struct blk_mq_hw_ctx *hctx, *sq_hctx;
 	int i;
 
+	sq_hctx = NULL;
+	if (blk_mq_has_sqsched(q))
+		sq_hctx = blk_mq_get_sq_hctx(q);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_hctx_stopped(hctx))
 			continue;
 
-		blk_mq_run_hw_queue(hctx, async);
+		/*
+		 * Dispatch from this hctx either if there's no hctx preferred
+		 * by IO scheduler or if it has requests that bypass the
+		 * scheduler.
+		 */
+		if (!sq_hctx || sq_hctx == hctx ||
+		    !list_empty_careful(&hctx->dispatch))
+			blk_mq_run_hw_queue(hctx, async);
 	}
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
@@ -1543,9 +1588,12 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
  */
 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx, *sq_hctx;
 	int i;
 
+	sq_hctx = NULL;
+	if (blk_mq_has_sqsched(q))
+		sq_hctx = blk_mq_get_sq_hctx(q);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_hctx_stopped(hctx))
 			continue;
@@ -1559,7 +1607,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 		if (delayed_work_pending(&hctx->run_work))
 			continue;
 
-		blk_mq_delay_run_hw_queue(hctx, msecs);
+		/*
+		 * Dispatch from this hctx either if there's no hctx preferred
+		 * by IO scheduler or if it has requests that bypass the
+		 * scheduler.
+		 */
+		if (!sq_hctx || sq_hctx == hctx ||
+		    !list_empty_careful(&hctx->dispatch))
+			blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index d8f3cb1bffa6..bc0376c338ff 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -972,6 +972,7 @@ static struct elevator_type kyber_sched = {
 #endif
 	.elevator_attrs = kyber_sched_attrs,
 	.elevator_name = "kyber",
+	.elevator_features = ELEVATOR_F_MQ_AWARE,
 	.elevator_owner = THIS_MODULE,
 };
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index f5dbcf46e08b..f6b2f98adb0a 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -280,6 +280,9 @@ enum {
 #define rq_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
 #define rq_fifo_clear(rq)	list_del_init(&(rq)->queuelist)
 
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE	(1U << 1)
+
 #else /* CONFIG_BLOCK */
 
 static inline void load_default_elevator_module(void) { }