From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 173974
CVE: NA

---------------------------
Prepare to support concurrent queue quiesce between drivers and the block layer. No functional changes.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq.c | 58 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 18 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ef62a83314a5d..f9b4b73a2f38d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -211,32 +211,29 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

+static void __blk_mq_quiesce_queue_nowait(struct request_queue *q,
+                                          unsigned int flag)
+{
+        blk_queue_flag_set(flag, q);
+}
+
 /*
  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
  * mpt3sas driver such that this function can be removed.
  */
 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 {
-        blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+        __blk_mq_quiesce_queue_nowait(q, QUEUE_FLAG_QUIESCED);
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

-/**
- * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
- * @q: request queue.
- *
- * Note: this function does not prevent that the struct request end_io()
- * callback function is invoked. Once this function is returned, we make
- * sure no dispatch can happen until the queue is unquiesced via
- * blk_mq_unquiesce_queue().
- */
-void blk_mq_quiesce_queue(struct request_queue *q)
+static void __blk_mq_quiesce_queue(struct request_queue *q, unsigned int flag)
 {
         struct blk_mq_hw_ctx *hctx;
         unsigned int i;
         bool rcu = false;

-        blk_mq_quiesce_queue_nowait(q);
+        __blk_mq_quiesce_queue_nowait(q, flag);

         queue_for_each_hw_ctx(q, hctx, i) {
                 if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -247,15 +244,30 @@ void blk_mq_quiesce_queue(struct request_queue *q)
         if (rcu)
                 synchronize_rcu();
 }
+
+/**
+ * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
+ * @q: request queue.
+ *
+ * Note: this function does not prevent that the struct request end_io()
+ * callback function is invoked. Once this function is returned, we make
+ * sure no dispatch can happen until the queue is unquiesced via
+ * blk_mq_unquiesce_queue().
+ */
+void blk_mq_quiesce_queue(struct request_queue *q)
+{
+        __blk_mq_quiesce_queue(q, QUEUE_FLAG_QUIESCED);
+}
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

-bool blk_mq_quiesce_queue_without_rcu(struct request_queue *q)
+static bool __blk_mq_quiesce_queue_without_rcu(struct request_queue *q,
+                                               unsigned int flag)
 {
         struct blk_mq_hw_ctx *hctx;
         unsigned int i;
         bool rcu = false;

-        blk_mq_quiesce_queue_nowait(q);
+        __blk_mq_quiesce_queue_nowait(q, flag);

         queue_for_each_hw_ctx(q, hctx, i) {
                 if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -265,8 +277,21 @@ bool blk_mq_quiesce_queue_without_rcu(struct request_queue *q)
         }
         return rcu;
 }
+
+bool blk_mq_quiesce_queue_without_rcu(struct request_queue *q)
+{
+        return __blk_mq_quiesce_queue_without_rcu(q, QUEUE_FLAG_QUIESCED);
+}
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_without_rcu);

+static void __blk_mq_unquiesce_queue(struct request_queue *q, unsigned int flag)
+{
+        blk_queue_flag_clear(flag, q);
+
+        /* dispatch requests which are inserted during quiescing */
+        blk_mq_run_hw_queues(q, true);
+}
+
 /*
  * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
  * @q: request queue.
@@ -276,10 +301,7 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_without_rcu);
  */
 void blk_mq_unquiesce_queue(struct request_queue *q)
 {
-        blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
-
-        /* dispatch requests which are inserted during quiescing */
-        blk_mq_run_hw_queues(q, true);
+        __blk_mq_unquiesce_queue(q, QUEUE_FLAG_QUIESCED);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 173974
CVE: NA

---------------------------
The queue will be quiesced if either the old or the new flag is set, and it will be unquiesced only when both flags are cleared.
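Concretely, the dispatch paths now treat the two bits as one logical "quiesced" state. A condensed sketch of the rule, assuming the usual <linux/blkdev.h> context (the helper name below is only for illustration; the patch open-codes this check in blk_mq_sched_dispatch_requests(), blk_mq_run_hw_queue() and __blk_mq_try_issue_directly()):

	/* illustration only: the queue counts as quiesced if either the
	 * driver-owned bit or the block-layer-owned bit is set */
	static inline bool queue_quiesced_any(struct request_queue *q)
	{
		return blk_queue_quiesced(q) ||		/* QUEUE_FLAG_QUIESCED */
		       blk_queue_quiesced_internal(q);	/* QUEUE_FLAG_QUIESCED_INTERNAL */
	}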
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq-sched.c   |  3 ++-
 block/blk-mq.c         | 14 +++++++++++++-
 block/blk-mq.h         |  2 ++
 include/linux/blkdev.h |  9 +++++++--
 4 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 3521eca1b2984..b7be8e74fab8c 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -195,7 +195,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
         LIST_HEAD(rq_list);

         /* RCU or SRCU read lock is needed before checking quiesced flag */
-        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q) ||
+                     blk_queue_quiesced_internal(q)))
                 return;

         hctx->run++;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f9b4b73a2f38d..8bf3da9b7178e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -260,6 +260,11 @@ void blk_mq_quiesce_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

+void blk_mq_quiesce_queue_internal(struct request_queue *q)
+{
+        __blk_mq_quiesce_queue(q, QUEUE_FLAG_QUIESCED_INTERNAL);
+}
+
 static bool __blk_mq_quiesce_queue_without_rcu(struct request_queue *q,
                                                unsigned int flag)
 {
@@ -305,6 +310,11 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);

+void blk_mq_unquiesce_queue_internal(struct request_queue *q)
+{
+        __blk_mq_unquiesce_queue(q, QUEUE_FLAG_QUIESCED_INTERNAL);
+}
+
 void blk_mq_wake_waiters(struct request_queue *q)
 {
         struct blk_mq_hw_ctx *hctx;
@@ -1491,6 +1501,7 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
          */
         hctx_lock(hctx, &srcu_idx);
         need_run = !blk_queue_quiesced(hctx->queue) &&
+                !blk_queue_quiesced_internal(hctx->queue) &&
                 blk_mq_hctx_has_pending(hctx);
         hctx_unlock(hctx, srcu_idx);

@@ -1844,7 +1855,8 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
          * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
          * and avoid driver to try to dispatch again.
          */
-        if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
+        if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q) ||
+            blk_queue_quiesced_internal(q)) {
                 run_queue = false;
                 bypass_insert = false;
                 goto insert;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index bbb0c1d8849b4..88b590c245476 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -46,6 +46,8 @@ bool blk_mq_get_driver_tag(struct request *rq);
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                         struct blk_mq_ctx *start);
 void blk_mq_put_rq_ref(struct request *rq);
+void blk_mq_quiesce_queue_internal(struct request_queue *q);
+void blk_mq_unquiesce_queue_internal(struct request_queue *q);
 /*
  * Internal helpers for allocating/freeing the request map
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50f3b1eaa021f..109add33b3318 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -725,8 +725,11 @@ struct request_queue {
 #define QUEUE_FLAG_POLL_STATS   25      /* collecting stats for hybrid polling */
 #define QUEUE_FLAG_REGISTERED   26      /* queue has been registered to a disk */
 #define QUEUE_FLAG_SCSI_PASSTHROUGH 27  /* queue supports SCSI commands */
-#define QUEUE_FLAG_QUIESCED     28      /* queue has been quiesced */
+/* queue has been quiesced, used in driver */
+#define QUEUE_FLAG_QUIESCED     28
 #define QUEUE_FLAG_FORECE_QUIESCE 29    /* force quiesce when cleanup queue */
+/* queue has been quiesced, used in block layer */
+#define QUEUE_FLAG_QUIESCED_INTERNAL 30

 #define QUEUE_FLAG_DEFAULT      ((1 << QUEUE_FLAG_IO_STAT) |            \
                                  (1 << QUEUE_FLAG_SAME_COMP)    |       \
@@ -763,7 +766,9 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 #define blk_noretry_request(rq) \
         ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
                              REQ_FAILFAST_DRIVER))
-#define blk_queue_quiesced(q)   test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
+#define blk_queue_quiesced(q)   test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
+#define blk_queue_quiesced_internal(q) \
+        test_bit(QUEUE_FLAG_QUIESCED_INTERNAL, &(q)->queue_flags)
 #define blk_queue_pm_only(q)    atomic_read(&(q)->pm_only)
 #define blk_queue_fua(q)        test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 173974
CVE: NA

---------------------------
Our test reported a null-ptr-deref problem:
[89470.197765] kasan: GPF could be caused by NULL-ptr deref or user memory access
[89470.198896] general protection fault: 0000 [#1] SMP KASAN
[89470.199706] CPU: 2 PID: 17722 Comm: kworker/u8:3 Not tainted 4.19.195-01446-gb2a62977d3e4 #1
[89470.200935] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[89470.202247] Workqueue: nvme-reset-wq nvme_reset_work
[89470.203009] RIP: 0010:kyber_has_work+0x60/0xf0
[89470.203665] Code: 8b a3 18 01 00 00 49 bd 00 00 00 00 00 fc ff df 49 8d 5c 24 08 49 8d 6c 24 48 49 83 c4 38 e8 27 1c 1a ff 48 89 d8 48 c1 e8 03 <42> 80 3c 28 00 75 68 48 3b 1b 75 50 e8 0f 1c 1a ff 48 8d 7b 08 48
[89470.206331] RSP: 0018:ffff888004c379c8 EFLAGS: 00010202
[89470.207096] RAX: 0000000000000001 RBX: 0000000000000008 RCX: ffffffffb5238989
[89470.208138] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff888101ff0be0
[89470.209206] RBP: 0000000000000048 R08: ffffed1020653380 R09: ffffed1020653380
[89470.210250] R10: 0000000000000001 R11: ffffed102065337f R12: 0000000000000038
[89470.211276] R13: dffffc0000000000 R14: 0000000000000000 R15: ffff888101ff0be8
[89470.220344] FS: 0000000000000000(0000) GS:ffff88810ed00000(0000) knlGS:0000000000000000
[89470.221184] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[89470.221775] CR2: 00007f70134c2770 CR3: 00000000b6e0e000 CR4: 00000000000006e0
[89470.222521] Call Trace:
[89470.222811]  ? kyber_read_lat_show+0x80/0x80
[89470.223223]  blk_mq_run_hw_queue+0x26b/0x2f0
[89470.223631]  blk_mq_run_hw_queues+0xe8/0x160
[89470.224064]  nvme_start_queues+0x6c/0xb0
[89470.224460]  nvme_start_ctrl+0x155/0x300
[89470.224864]  ? down_read+0xf/0x80
[89470.225186]  ? nvme_set_queue_count+0x1f0/0x1f0
[89470.225803]  ? nvme_change_ctrl_state+0x83/0x2e0
[89470.226285]  nvme_reset_work+0x2fd2/0x4ee0
[89470.226699]  ? rb_erase_cached+0x8f8/0x19d0
[89470.227103]  ? nvme_alloc_queue+0xbf0/0xbf0
[89470.227758]  ? set_next_entity+0x248/0x730
[89470.228174]  ? pick_next_entity+0x199/0x400
[89470.228592]  ? put_prev_entity+0x4f/0x350
[89470.228976]  ? pick_next_task_fair+0x7c9/0x1500
[89470.229424]  ? account_entity_dequeue+0x30b/0x580
[89470.229874]  ? dequeue_entity+0x29b/0xf70
[89470.230352]  ? __switch_to+0x17b/0xb60
[89470.230936]  ? compat_start_thread+0x80/0x80
[89470.231447]  ? dequeue_task_fair+0xe4/0x1d60
[89470.231871]  ? tty_ldisc_receive_buf+0xaa/0x170
[89470.232501]  ? read_word_at_a_time+0xe/0x20
[89470.233195]  ? strscpy+0x96/0x300
[89470.233761]  process_one_work+0x706/0x1270
[89470.234438]  worker_thread+0x91/0xc80
[89470.235038]  ? process_one_work+0x1270/0x1270
[89470.235749]  kthread+0x305/0x3c0
[89470.236292]  ? kthread_park+0x1c0/0x1c0
[89470.236944]  ret_from_fork+0x1f/0x30
[89470.237543] Modules linked in:
[89470.238179] ---[ end trace 4ed415cdb7eafdd4 ]---
This happens because blk_mq_quiesce_queue() is called concurrently from the driver and the block layer:
t1: driver                              t2: block layer
blk_mq_quiesce_queue
                                        ...
                                        elevator_switch
                                          blk_mq_quiesce_queue
                                          elevator_exit
blk_mq_unquiesce_queue
  blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
  blk_mq_run_hw_queue
    blk_queue_quiesced
    blk_mq_hctx_has_pending
      blk_mq_sched_has_work
      -> elevator is not ready
Mainline fixes this problem in commit e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce"); however, that approach relies on the quiesce and unquiesce calls being balanced in every driver, which is hard to guarantee considering out-of-tree modules.
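For reference, the mainline idea is to make quiesce/unquiesce counted rather than flag-based; roughly (a sketch of the upstream approach, not code from this series; the quiesce_depth field and queue_lock usage follow the mainline patch):

	/* quiesce: only the 0 -> 1 transition sets the flag */
	spin_lock_irqsave(&q->queue_lock, flags);
	if (!q->quiesce_depth++)
		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(&q->queue_lock, flags);

	/* unquiesce: only the 1 -> 0 transition clears the flag and
	 * reruns the hardware queues */
	spin_lock_irqsave(&q->queue_lock, flags);
	if (!--q->quiesce_depth)
		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(&q->queue_lock, flags);

This only works if every quiesce is paired with exactly one unquiesce, which is exactly the balance requirement that cannot be guaranteed for out-of-tree drivers.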
Fix the problem by using a separate flag for the block layer, so that quiescing the queue from the block layer does not affect the driver's quiesce state.
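After the conversion, a block-layer path such as elevator_switch() (see the hunks below) quiesces through the internal variants, so the driver's flag is never cleared behind its back:

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue_internal(q);	/* sets only QUEUE_FLAG_QUIESCED_INTERNAL */

	err = elevator_switch_mq(q, new_e);

	blk_mq_unquiesce_queue_internal(q);	/* QUEUE_FLAG_QUIESCED, if set by the
						 * driver, stays set */
	blk_mq_unfreeze_queue(q);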
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-core.c  | 2 +-
 block/blk-mq.c    | 4 ++--
 block/blk-sysfs.c | 4 ++--
 block/elevator.c  | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index bb4d3da762b1e..41d0b09e9a673 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -812,7 +812,7 @@ void blk_cleanup_queue(struct request_queue *q)
          */
         if (q->mq_ops && (blk_queue_init_done(q) ||
             test_bit(QUEUE_FLAG_FORECE_QUIESCE, &q->queue_flags)))
-                blk_mq_quiesce_queue(q);
+                blk_mq_quiesce_queue_internal(q);

         /* for synchronous bio-based driver finish in-flight integrity i/o */
         blk_flush_integrity();
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8bf3da9b7178e..0732bcc65f889 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3125,7 +3125,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
                 return -EINVAL;

         blk_mq_freeze_queue(q);
-        blk_mq_quiesce_queue(q);
+        blk_mq_quiesce_queue_internal(q);

         ret = 0;
         queue_for_each_hw_ctx(q, hctx, i) {
@@ -3151,7 +3151,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
         if (!ret)
                 q->nr_requests = nr;

-        blk_mq_unquiesce_queue(q);
+        blk_mq_unquiesce_queue_internal(q);
         blk_mq_unfreeze_queue(q);

         return ret;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 4167f6a5c3f86..7ce092ab0f057 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -461,7 +461,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
          */
         if (q->mq_ops) {
                 blk_mq_freeze_queue(q);
-                blk_mq_quiesce_queue(q);
+                blk_mq_quiesce_queue_internal(q);
         } else
                 blk_queue_bypass_start(q);
@@ -469,7 +469,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
         wbt_update_limits(q);

         if (q->mq_ops) {
-                blk_mq_unquiesce_queue(q);
+                blk_mq_unquiesce_queue_internal(q);
                 blk_mq_unfreeze_queue(q);
         } else
                 blk_queue_bypass_end(q);
diff --git a/block/elevator.c b/block/elevator.c
index ddbcd36616a8d..31d2b4fb1411a 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -1013,11 +1013,11 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)

         if (q->mq_ops) {
                 blk_mq_freeze_queue(q);
-                blk_mq_quiesce_queue(q);
+                blk_mq_quiesce_queue_internal(q);

                 err = elevator_switch_mq(q, new_e);

-                blk_mq_unquiesce_queue(q);
+                blk_mq_unquiesce_queue_internal(q);
                 blk_mq_unfreeze_queue(q);

                 return err;