From: yu kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 34280
CVE: NA
---------------------------
tags->rqs[] is not cleared when a driver tag is freed, to avoid an extra store on a shared area in the per-io path. But there is a window between getting a driver tag and writing tags->rqs[], so we may see a stale rq in tags->rqs[] that has already been freed, as in the following case:
blk_mq_get_request              blk_mq_queue_tag_busy_iter
  -> blk_mq_get_tag
                                  -> bt_for_each
                                    -> bt_iter
                                      -> rq = tags->rqs[]
                                      -> rq->q
  -> blk_mq_rq_ctx_init
    -> data->hctx->tags->rqs[rq->tag] = rq;
In addition, tags->rqs[] only contains the requests that have obtained a driver tag, so it is not accurate for the io-scheduler case when accounting busy tags in part_in_flight.
To fix both problems, this patch changes blk_mq_queue_tag_busy_iter() to use tags->static_rqs[] instead of tags->rqs[]. We have to check whether an io scheduler is attached to decide between hctx->tags and hctx->sched_tags. Before that we try to get a non-zero q_usage_counter, which avoids racing with nr_hw_queues updates, io-scheduler switches and even queue cleanup.
Add an 'inflight' parameter that selects between iterating in-flight requests and iterating all busy tags, add a new helper interface blk_mq_queue_tag_inflight_iter() that iterates all in-flight tags, and export this interface for drivers.
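As an illustration, a driver-side caller could look like the minimal sketch below. This is hypothetical and not part of this patch: the "foo" names are invented for the example; only blk_mq_queue_tag_inflight_iter() and the busy_iter_fn signature (which takes the hctx as its first argument) come from this series:

	/* Count in-flight requests on a queue (assumes <linux/blk-mq.h>). */
	struct foo_dev {
		unsigned int inflight;
	};

	/* busy_iter_fn callback: invoked once per in-flight request. */
	static void foo_count_inflight(struct blk_mq_hw_ctx *hctx,
				       struct request *rq, void *priv,
				       bool reserved)
	{
		struct foo_dev *foo = priv;

		foo->inflight++;
	}

	static unsigned int foo_inflight(struct request_queue *q,
					 struct foo_dev *foo)
	{
		foo->inflight = 0;
		/*
		 * Unlike the busy-tag variant, only requests in
		 * MQ_RQ_IN_FLIGHT state (plus an in-flight flush_rq)
		 * are visited.
		 */
		blk_mq_queue_tag_inflight_iter(q, foo_count_inflight, foo);
		return foo->inflight;
	}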
Signed-off-by: yu kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq-tag.c     | 77 ++++++++++++++++++++++++++++++++++++++++----------
 block/blk-mq.c         |  6 ++--
 include/linux/blk-mq.h |  3 +-
 3 files changed, 67 insertions(+), 19 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 41317c5..323bbca 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -216,37 +216,51 @@ struct bt_iter_data {
 	busy_iter_fn *fn;
 	void *data;
 	bool reserved;
+	bool inflight;
 };
 
 static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 {
 	struct bt_iter_data *iter_data = data;
 	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
-	struct blk_mq_tags *tags = hctx->tags;
 	bool reserved = iter_data->reserved;
+	struct blk_mq_tags *tags;
 	struct request *rq;
 
+	tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags;
+
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
 
 	/*
-	 * We can hit rq == NULL here, because the tagging functions
-	 * test and set the bit before assining ->rqs[].
+	 * Because tags->rqs[] is not cleared when a driver tag is freed
+	 * and there is a window between getting a driver tag and writing
+	 * tags->rqs[], we may see a stale rq in tags->rqs[] which may
+	 * already have been freed. Using static_rqs[] is safer.
 	 */
-	if (rq && rq->q == hctx->queue)
+	rq = tags->static_rqs[bitnr];
+
+	/*
+	 * There is a small window between getting the tag and
+	 * blk_mq_rq_ctx_init(), so rq->q and rq->mq_hctx may differ.
+	 */
+	if (rq && rq->q == hctx->queue &&
+	    (!iter_data->inflight ||
+	     blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
 		iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
 
-static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
-		busy_iter_fn *fn, void *data, bool reserved)
+static void bt_for_each(struct blk_mq_hw_ctx *hctx,
+			struct sbitmap_queue *bt, busy_iter_fn *fn,
+			void *data, bool reserved, bool inflight)
 {
 	struct bt_iter_data iter_data = {
 		.hctx = hctx,
 		.fn = fn,
 		.data = data,
 		.reserved = reserved,
+		.inflight = inflight,
 	};
 
 	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
@@ -314,22 +328,23 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 }
 EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
 
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
-		void *priv)
+static void __blk_mq_queue_tag_busy_iter(struct request_queue *q,
+		busy_iter_fn *fn, void *priv, bool inflight)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	/*
-	 * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
-	 * queue_hw_ctx after freeze the queue, so we use q_usage_counter
-	 * to avoid race with it.
+	 * Get a reference on the queue unless the counter has already
+	 * reached zero. We use this to avoid racing with code that would
+	 * modify the hctxs after freezing and draining the queue, including
+	 * nr_hw_queues updates, io-scheduler switches and queue cleanup.
 	 */
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		struct blk_mq_tags *tags = hctx->tags;
+		struct blk_mq_tags *tags;
 
 		/*
 		 * If not software queues are currently mapped to this
@@ -338,13 +353,45 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		if (!blk_mq_hw_queue_mapped(hctx))
 			continue;
 
+		tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags;
+
 		if (tags->nr_reserved_tags)
-			bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
-		bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
+			bt_for_each(hctx, &tags->breserved_tags,
+					fn, priv, true, inflight);
+		bt_for_each(hctx, &tags->bitmap_tags,
+				fn, priv, false, inflight);
+		/*
+		 * flush_rq represents the rq with REQ_PREFLUSH and REQ_FUA
+		 * (when FUA is not supported by the device) that is issued
+		 * to the device. So we need to consider it when iterating
+		 * in-flight rqs, but need not count it when iterating busy
+		 * tags.
+		 */
+		if (inflight &&
+		    blk_mq_rq_state(hctx->fq->flush_rq) == MQ_RQ_IN_FLIGHT)
+			fn(hctx, hctx->fq->flush_rq, priv, false);
 	}
 	blk_queue_exit(q);
 }
 
+/*
+ * Iterate all the busy tags, including pending and in-flight ones.
+ */
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+		void *priv)
+{
+	__blk_mq_queue_tag_busy_iter(q, fn, priv, false);
+}
+
+/*
+ * Iterate all the in-flight tags.
+ */
+void blk_mq_queue_tag_inflight_iter(struct request_queue *q,
+		busy_iter_fn *fn, void *priv)
+{
+	__blk_mq_queue_tag_busy_iter(q, fn, priv, true);
+}
+EXPORT_SYMBOL(blk_mq_queue_tag_inflight_iter);
+
 static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
 		    bool round_robin, int node)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8a7c3d8..ee07575 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -112,7 +112,7 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
 	struct mq_inflight mi = { .part = part, .inflight = inflight, };
 
 	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+	blk_mq_queue_tag_inflight_iter(q, blk_mq_check_inflight, &mi);
 }
 
 static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
@@ -131,7 +131,7 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 	struct mq_inflight mi = { .part = part, .inflight = inflight, };
 
 	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+	blk_mq_queue_tag_inflight_iter(q, blk_mq_check_inflight_rw, &mi);
 }
 
 void blk_freeze_queue_start(struct request_queue *q)
@@ -875,7 +875,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
+	blk_mq_queue_tag_inflight_iter(q, blk_mq_check_expired, &next);
 
 	if (next != 0) {
 		mod_timer(&q->timeout, next);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6578070..149d411 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -320,7 +320,8 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
-
+void blk_mq_queue_tag_inflight_iter(struct request_queue *q, busy_iter_fn *fn,
+		void *priv);
 int blk_mq_map_queues(struct blk_mq_tag_set *set);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
From: yu kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 34280
CVE: NA
---------------------------
blk_mq_tagset_busy_iter() is not safe in that it may fetch a stale request from tags->rqs[]. Use blk_mq_queue_tag_inflight_iter() here instead.
Signed-off-by: yu kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/blk-mq-debugfs.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f0865b6..e098b79 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -427,7 +427,8 @@ struct show_busy_params {
  * Note: the state of a request may change while this function is in progress,
  * e.g. due to a concurrent blk_mq_finish_request() call.
  */
-static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
+static void hctx_show_busy_rq(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *data, bool reserved)
 {
 	const struct show_busy_params *params = data;
 
@@ -442,7 +443,7 @@ static int hctx_busy_show(void *data, struct seq_file *m)
 	struct blk_mq_hw_ctx *hctx = data;
 	struct show_busy_params params = { .m = m, .hctx = hctx };
 
-	blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq,
+	blk_mq_queue_tag_inflight_iter(hctx->queue, hctx_show_busy_rq,
 				&params);
 
 	return 0;
From: yu kuai <yukuai3@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 34280
CVE: NA
---------------------------
blk_mq_tagset_busy_iter() is not safe in that it may fetch a stale request from tags->rqs[]. Use blk_mq_queue_tag_inflight_iter() here instead.
Signed-off-by: yu kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/block/nbd.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 51ba8d0..9b1684a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -763,7 +763,8 @@ static void recv_work(struct work_struct *work)
 	kfree(args);
 }
 
-static void nbd_clear_req(struct request *req, void *data, bool reserved)
+static void nbd_clear_req(struct blk_mq_hw_ctx *hctx,
+		struct request *req, void *data, bool reserved)
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 
@@ -777,7 +778,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
 static void nbd_clear_que(struct nbd_device *nbd)
 {
 	blk_mq_quiesce_queue(nbd->disk->queue);
-	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
+	blk_mq_queue_tag_inflight_iter(nbd->disk->queue, nbd_clear_req, NULL);
 	blk_mq_unquiesce_queue(nbd->disk->queue);
 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 }