From: Salman Qazi <sqazi@google.com>
mainline inclusion
from mainline-v5.8-rc1
commit 28d65729b050977d8a9125e6726871e83bd22124
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6RQVT
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
--------------------------------
Flushes bypass the I/O scheduler and get added to hctx->dispatch in blk_mq_sched_bypass_insert. This can happen while a kworker is running the hctx->run_work work item and is past the point in blk_mq_sched_dispatch_requests where hctx->dispatch is checked.
The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time, because the I/O scheduler can feed an arbitrary number of commands.
Since we have only one hctx->run_work, the commands waiting in hctx->dispatch will wait an arbitrary length of time for run_work to be rerun.
A similar phenomenon exists with dispatches from the software queue.
The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and blk_mq_do_dispatch_ctx and, if it is found non-empty, return from the run_work handler and let it be rerun.
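To illustrate the pattern (this sketch is not part of the patch): a minimal user-space model of the bail-out, where bypass_pending is a hypothetical stand-in for !list_empty_careful(&hctx->dispatch) and sched_pending stands in for the elevator's has_work()/dispatch_request() pair:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for hctx->dispatch and the elevator state. */
static bool bypass_pending;
static int  sched_pending = 5;

/*
 * Models the patched blk_mq_do_dispatch_sched(): keep feeding scheduler
 * requests, but return -EAGAIN as soon as a bypass request (e.g. a flush)
 * lands on the dispatch list, instead of draining the scheduler first.
 */
static int do_dispatch_sched(void)
{
	do {
		if (sched_pending == 0)
			break;
		if (bypass_pending)
			return -EAGAIN;	/* let run_work be rerun */
		printf("scheduler request dispatched, %d left\n",
		       --sched_pending);
		if (sched_pending == 3)
			bypass_pending = true;	/* a flush arrives mid-loop */
	} while (1);

	return 0;
}

int main(void)
{
	if (do_dispatch_sched() == -EAGAIN)
		printf("bailed out early: the flush is no longer starved\n");
	return 0;
}

Without the check, the loop runs until the scheduler runs dry; with it, a request parked on the dispatch list bounds the wait to a single loop iteration.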
Signed-off-by: Salman Qazi <sqazi@google.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-mq-sched.c
Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 block/blk-mq-sched.c | 66 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 14 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index fa4781865be6..0fb33abac3f6 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -87,12 +87,16 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	int ret = 0;
 	unsigned long end = jiffies + HZ;
 
 	do {
@@ -102,6 +106,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		    !e->type->ops.mq.has_work(hctx))
 			break;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -133,6 +142,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 			break;
 		}
 	} while (1);
+
+	return ret;
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -150,16 +161,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	int ret = 0;
 
 	do {
 		struct request *rq;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;
 
@@ -193,22 +213,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
+	return ret;
 }
 
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
+	int ret = 0;
 	LIST_HEAD(rq_list);
 
-	/* RCU or SRCU read lock is needed before checking quiesced flag */
-	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q) ||
-	    blk_queue_quiesced_internal(q)))
-		return;
-
-	hctx->run++;
-
 	/*
 	 * If we have previous entries on our dispatch list, grab them first for
 	 * more fair dispatch.
@@ -237,19 +252,42 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		blk_mq_sched_mark_restart_hctx(hctx);
 		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
 			if (has_sched_dispatch)
-				blk_mq_do_dispatch_sched(hctx);
+				ret = blk_mq_do_dispatch_sched(hctx);
 			else
-				blk_mq_do_dispatch_ctx(hctx);
+				ret = blk_mq_do_dispatch_ctx(hctx);
 		}
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		ret = blk_mq_do_dispatch_sched(hctx);
 	} else if (hctx->dispatch_busy) {
 		/* dequeue request one by one from sw queue if queue is busy */
-		blk_mq_do_dispatch_ctx(hctx);
+		ret = blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list, false);
 	}
+
+	return ret;
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+
+	/* RCU or SRCU read lock is needed before checking quiesced flag */
+	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q) ||
+	    blk_queue_quiesced_internal(q)))
+		return;
+
+	hctx->run++;
+
+	/*
+	 * A return of -EAGAIN is an indication that hctx->dispatch is not
+	 * empty and we must run again in order to avoid starving flushes.
+	 */
+	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+			blk_mq_run_hw_queue(hctx, true);
+	}
 }
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
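
For reference, the caller-side control flow added above can be sketched in isolation (also not part of the patch; sketch_dispatch_requests() and sketch_requeue_run_work() are hypothetical stand-ins for __blk_mq_sched_dispatch_requests() and blk_mq_run_hw_queue(hctx, true)):

#include <errno.h>

int sketch_dispatch_requests(void);	/* stands in for __blk_mq_sched_dispatch_requests() */
void sketch_requeue_run_work(void);	/* stands in for blk_mq_run_hw_queue(hctx, true) */

/*
 * Models the rewritten blk_mq_sched_dispatch_requests(): retry once
 * inline, and if the dispatch list keeps refilling, requeue the work
 * item instead of looping in the current run_work context forever.
 */
void sketch_run_handler(void)
{
	if (sketch_dispatch_requests() == -EAGAIN) {
		if (sketch_dispatch_requests() == -EAGAIN)
			sketch_requeue_run_work();
	}
}

The single inline retry covers the common case where the bypassed request dispatches immediately; only a second consecutive -EAGAIN pays for a workqueue round trip, which also yields the kworker rather than monopolizing it.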